+// import fs from 'fs';
+import fsPromise from 'node:fs/promises';
+
+import { dirname } from 'path';
+import { fileURLToPath } from 'url';
+import { parse } from 'ts-command-line-args';
+
 import '@tensorflow/tfjs-node';
 import fs from 'node:fs';
 import path from 'node:path';
-import { Tokenizer, models } from '@epfml/discojs';
+import { models, serialization, Tokenizer } from '@epfml/discojs';
 import { loadHellaSwag } from '@epfml/discojs-node';
+// import { AutoTokenizer } from '@xenova/transformers';

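+// CLI flags (defined below): --model <onnx|gpt-tfjs-random|gpt-tfjs-pretrained>,
+// --numDataPoints <n>, --logFile <path>, --pretrainedModelPath <path>, --help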
-const logFile = path.join('..', 'datasets', 'LogFile_hellaswag.txt');
-const logLines: string[] = [];
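+// Recreate __dirname, which is not available in ES modules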
+const __dirname = dirname(fileURLToPath(import.meta.url));

+const logLines: string[] = [];
 function log(message: string) {
   console.log(message);
   logLines.push(message);
 }

-const hellaswagDataset: models.HellaSwagDataset = await loadHellaSwag(-1)
-
-async function evaluateTFJS(tokenizer: Tokenizer) {
-  const model = new models.GPT({ seed: 42 });
-  log('Evaluating TFJS GPT on HellaSwag...');
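+// Run the HellaSwag benchmark on the given model and log accuracy and runtime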
+async function evaluateModel(model: models.GPT | models.ONNXModel, numDataPoints = -1) {
+  const hellaswagDataset: models.HellaSwagDataset = await loadHellaSwag(numDataPoints);
+  const tokenizer = await Tokenizer.from_pretrained('Xenova/gpt2');
+  log('Starting the HellaSwag benchmark...');

   const start = Date.now();
-  const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, false);
+  const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, true);
   const duration = ((Date.now() - start) / 1000).toFixed(2);

-  log(`TFJS GPT Accuracy: ${(accuracy * 100).toFixed(2)}%`);
-  log(`TFJS GPT Evaluation Time: ${duration} seconds`);
+  log(`Final accuracy: ${(accuracy * 100).toFixed(2)}%`);
+  log(`Evaluation Time: ${duration} seconds`);
 }

-async function evaluateXenova(tokenizer: Tokenizer) {
-  const model = await models.ONNXModel.init_pretrained('Xenova/gpt2');
-  log('Evaluating Xenova GPT-2 (ONNX) on HellaSwag...');
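+// Model back-ends supported by this benchmark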
+const ModelTypes = ['onnx', 'gpt-tfjs-random', 'gpt-tfjs-pretrained'] as const;
+type ModelType = typeof ModelTypes[number];

-  const start = Date.now();
-  const accuracy = await models.evaluate_hellaswag(model, tokenizer, hellaswagDataset, false);
-  const duration = ((Date.now() - start) / 1000).toFixed(2);
-
-  log(`Xenova GPT-2 Accuracy: ${(accuracy * 100).toFixed(2)}%`);
-  log(`Xenova GPT-2 Evaluation Time: ${duration} seconds`);
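+// Command-line options, parsed with ts-command-line-args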
+interface HellaSwagArgs {
+  model: ModelType
+  numDataPoints: number
+  logFile: string
+  pretrainedModelPath: string
+  help?: boolean
 }

 async function main(): Promise<void> {
-  fs.writeFileSync(logFile, '', 'utf-8'); // Clear old log file
+  const defaultPretrainedModelPath = path.join(__dirname, '..', '..', 'onnx-converter', 'assets', 'model.json');
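+  // Parse CLI arguments; --help (-h) prints the usage guide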
+  const args = parse<HellaSwagArgs>({
+    model: {
+      type: (raw: string) => raw as ModelType,
+      description: `Model type, one of ${ModelTypes}`,
+      defaultValue: 'onnx'
+    },
+    numDataPoints: {
+      type: Number,
+      description: 'Number of HellaSwag datapoints to evaluate, set -1 for the whole benchmark',
+      defaultValue: -1
+    },
+    logFile: {
+      type: String,
+      description: 'Relative path to the log file, defaults to ./hellaswag.log',
+      defaultValue: 'hellaswag.log'
+    },
+    pretrainedModelPath: {
+      type: String,
+      description: 'If specifying gpt-tfjs-pretrained, provide the relative path to the TF.js pretrained model',
+      defaultValue: defaultPretrainedModelPath
+    },
+    help: {
+      type: Boolean,
+      optional: true,
+      alias: 'h',
+      description: 'Prints this usage guide'
+    }
+  }, { helpArg: 'help' });

-  const tokenizer = await Tokenizer.from_pretrained('Xenova/gpt2');
-  await evaluateTFJS(tokenizer);
-  log('\n---\n');
-  await evaluateXenova(tokenizer);
+  const logFile = path.join(__dirname, args.logFile);
+  fs.writeFileSync(logFile, '', 'utf-8'); // Clear the log file
+
+  let model: models.GPT | models.ONNXModel | undefined;
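+  // Instantiate the model selected via --model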
+  switch (args.model) {
+    case 'onnx':
+      log('Using ONNX pretrained model Xenova/gpt2');
+      model = await models.ONNXModel.init_pretrained('Xenova/gpt2');
+      break;
+    case 'gpt-tfjs-random':
+      log('Using GPT-TFJS with random initialization');
+      model = new models.GPT({ seed: 42 });
+      break;
+    case 'gpt-tfjs-pretrained': {
+      log('Using GPT-TFJS with pretrained weights');
+      if (args.pretrainedModelPath === undefined) {
+        throw new Error('If choosing gpt-tfjs-pretrained, provide the relative path to the TF.js pretrained model via `pretrainedModelPath`');
+      }
+      // Read and decode the serialized GPT model from disk
+      const encodedModel = await fsPromise.readFile(args.pretrainedModelPath);
+      model = await serialization.model.decode(encodedModel) as models.GPT;
+      break;
+    }
+    default:
+      throw new Error(`Unrecognized model type: ${args.model}`);
+  }
+  await evaluateModel(model, args.numDataPoints);

   fs.writeFileSync(logFile, logLines.join('\n'), 'utf-8');
   console.log(`\nResults written to ${logFile}`);