Add hellaswag2.ts and loaded_hellaswag.spec.ts for testing the loaded model on the whole HellaSwag validation set

christinakopi · christinakopi · commit ae33116ec278 · 2025-06-30T20:44:02.000+02:00
diff --git a/discojs/src/models/gpt/loaded_hellaswag.spec.ts b/discojs/src/models/gpt/loaded_hellaswag.spec.ts
@@ -0,0 +1,61 @@
+import { expect } from 'chai';
+import path from 'path';
+import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers';
+import { GPT } from './index.js';
+import { GPTModel } from './model.js';
+import { loadWeightsFromJSON } from './load_weights.js';
+import { evaluate } from '../hellaswag2.js';
+
+/**
+ * End-to-end benchmark: loads the pretrained GPT-2 weights into the tfjs GPT model and scores
+ * it on 10042 HellaSwag examples through `evaluate`. Mocha timeouts are disabled for both the
+ * setup hook and the test itself.
+ */
+describe('GPT Model with Pretrained Weights on Full HellaSwag', () => {
+  let gptForTest: GPT;
+  let tokenizer: PreTrainedTokenizer;
+
+  before(async function () {
+    // 0 is Mocha's documented way to disable the timeout; a millisecond value above
+    // 2^31 - 1 cannot be scheduled as a timer.
+    this.timeout(0);
+
+    console.log('Setting up benchmark: loading model, weights, tokenizer...');
+    console.time('Model+Tokenizer Loading Time');
+
+    const modelConfig = {
+      modelType: 'gpt2' as const,
+      contextLength: 1024,
+    };
+
+    const loadedGptModel = new GPTModel(modelConfig);
+    // The weights file is resolved relative to the current working directory.
+    const weightsFilename = 'gpt2_weights.jsonl';
+    const weightsFileUrl = new URL(path.resolve(weightsFilename), 'file://').href;
+    await loadWeightsFromJSON(loadedGptModel, weightsFileUrl);
+
+    gptForTest = new GPT(modelConfig, loadedGptModel);
+    tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
+
+    console.timeEnd('Model+Tokenizer Loading Time');
+    console.log('Setup complete.');
+  });
+
+  after(() => {
+    console.log('Tearing down test suite: disposing of model...');
+    // Optional call: the setup hook may have failed before the model was created.
+    gptForTest?.[Symbol.dispose]();
+  });
+
+  it('evaluates the loaded model on the entire HellaSwag dataset', async () => {
+    console.log('\n--- Starting HellaSwag Benchmark ---');
+
+    console.time('Evaluation Time on HellaSwag');
+    // 10042 examples, per-example printing turned off.
+    const accuracy = await evaluate(gptForTest, tokenizer, 10042, false);
+    console.timeEnd('Evaluation Time on HellaSwag');
+
+    console.log(`\n--- Benchmark Complete ---`);
+    console.log(`Final Accuracy on Full HellaSwag: ${(accuracy * 100).toFixed(2)}%`);
+
+    // NOTE(review): presumably the band expected for the pretrained GPT-2 weights — confirm.
+    expect(accuracy).to.be.gt(0.20);
+    expect(accuracy).to.be.lt(0.30);
+
+    console.log(`Accuracy is: ${(accuracy * 100).toFixed(2)}%`);
+    console.log('Benchmark passed successfully.');
+  }).timeout(0);
+});
diff --git a/discojs/src/models/hellaswag2.ts b/discojs/src/models/hellaswag2.ts
@@ -0,0 +1,268 @@
+import { promises as fsPromises } from 'fs';
+import fetch from 'node-fetch';
+import * as tf from '@tensorflow/tfjs';
+import { GPT } from './index.js';
+import { tokenize } from '../processing/text.js';
+import { PreTrainedTokenizer } from '@xenova/transformers';
+import * as readline from 'readline';
+import { fileURLToPath } from 'url';
+import path from 'path';
+import fs from 'fs';
+import { List } from 'immutable';
+import { ONNXModel } from './onnx.js';
+
+
+const HELLASWAG_URL = 'https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl';
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const LOCAL_FILE = path.resolve(__dirname, '../../../datasets/hellaswag_val.jsonl');
+
+
+/** Resolves to `true` when `path_` can be accessed on disk and to `false` otherwise. */
+async function fileExists(path_: string = LOCAL_FILE): Promise<boolean> {
+  // `access` rejects when the path cannot be accessed; translate that into `false`.
+  return fsPromises.access(path_).then(
+    () => true,
+    () => false,
+  );
+}
+
+/**
+ * Downloads the HellaSwag dataset to `path_` if it doesn't exist locally.
+ *
+ * @param path_ - destination file; nothing is downloaded when it already exists
+ * @throws if the server answers with an unsuccessful status or the file cannot be written
+ */
+async function downloadHellaSwag(path_: string = LOCAL_FILE): Promise<void> {
+  if (await fileExists(path_)) return;
+
+  const res = await fetch(HELLASWAG_URL);
+  // Without this check an HTTP error page would be stored as if it were the dataset.
+  if (!res.ok) {
+    throw new Error(`Failed to download HellaSwag: ${res.status} ${res.statusText}`);
+  }
+  const content = Buffer.from(await res.arrayBuffer());
+
+  // Write to a temporary file and rename it into place: an interrupted write must not leave
+  // a truncated file at `path_`, which the existence check above would accept next time.
+  const partialPath = `${path_}.part`;
+  await fsPromises.mkdir(path.dirname(path_), { recursive: true });
+  await fsPromises.writeFile(partialPath, content);
+  await fsPromises.rename(partialPath, path_);
+}
+
+/**
+ * Represents a single example from the HellaSwag dataset.
+ *
+ * ctx - The context sentence or paragraph that sets up the situation.
+ * endings - An array of four possible continuations of the context.
+ * label - The index (0–3) of the correct ending in the `endings` array.
+ *
+ * These are the only fields the loaders in this file copy out of each parsed JSON line.
+ */
+interface HellaSwagExample {
+  ctx: string;
+  endings: string[];
+  label: number;
+}
+
+/**
+ * Streams up to `limit` examples from the local HellaSwag file, which holds one JSON object
+ * per line. Blank lines are skipped. A line that is not valid JSON is logged together with
+ * its 1-based line number and the parse error is rethrown.
+ */
+async function* loadExamples(limit = 100): AsyncGenerator<HellaSwagExample> {
+  // Read the dataset line by line
+  const fileStream = fs.createReadStream(LOCAL_FILE, 'utf-8');
+  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
+
+  let lineNumber = 0;
+  let yielded = 0;
+  try {
+    for await (const line of rl) {
+      lineNumber++;
+      // Stop if the desired number of examples has been reached
+      if (yielded >= limit) break;
+
+      const trimmed = line.trim();
+      // A blank line is not an example; parsing it would throw
+      if (trimmed === '') continue;
+
+      let data: HellaSwagExample;
+      try {
+        data = JSON.parse(trimmed) as HellaSwagExample;
+      } catch (e) {
+        console.error(`Failed to parse line ${lineNumber}:`, line);
+        throw e;
+      }
+      yield { ctx: data.ctx, endings: data.endings, label: data.label };
+      yielded++;
+    }
+  } finally {
+    // Release the file even when iteration stops before the end of the file
+    rl.close();
+    fileStream.destroy();
+  }
+}
+
+
+/**
+ * Debugging helper that reads examples from the local HellaSwag file, ignoring blank lines.
+ * With `lineNumber` set, it yields only the example at that zero-based index among the
+ * non-blank lines (`limit` is then unused); otherwise it yields the first `limit` examples.
+ */
+async function* loadExample(limit = 1, lineNumber?: number): AsyncGenerator<HellaSwagExample> {
+  const input = fs.createReadStream(LOCAL_FILE, 'utf-8');
+  const reader = readline.createInterface({ input, crlfDelay: Infinity });
+
+  // Parses one raw line and keeps only the fields of HellaSwagExample
+  const toExample = (raw: string): HellaSwagExample => {
+    const data = JSON.parse(raw.trim()) as HellaSwagExample;
+    return { ctx: data.ctx, endings: data.endings, label: data.label };
+  };
+
+  let index = 0;
+  for await (const raw of reader) {
+    if (raw.trim().length === 0) continue;
+
+    if (lineNumber === undefined) {
+      if (index >= limit) return;
+      yield toExample(raw);
+    } else if (index === lineNumber) {
+      yield toExample(raw);
+      return; // only one line
+    }
+
+    index += 1;
+  }
+}
+
+/**
+ * Scores one candidate ending with the tfjs model.
+ * The input sequence is expected to be a concatenation of the context and the ending.
+ * Despite the name, the returned value is a loss (mean negative log-likelihood of the ending
+ * tokens), so lower means the model finds the ending more likely.
+ *
+ * Sources:
+ * https://github.com/karpathy/build-nanogpt/blob/master/hellaswag.py
+ * https://www.youtube.com/watch?v=l8pRSuU81PU
+ *
+ * @param gpt - the model used for scoring
+ * @param inputIds - token ids of the context followed by the ending
+ * @param ctxLength - number of leading tokens that belong to the context; they are excluded from the loss
+ * @returns the cross-entropy averaged over the ending tokens (0 when there are none)
+ * @throws if the computed loss is not a single number
+ */
+async function computeLogLikelihood(gpt: GPT, inputIds: number[], ctxLength: number): Promise<number> {
+  const lossTensor = tf.tidy(() => {
+    // Convert input sequence to shape [1, seq_len]
+    const inputTensor = tf.tensor2d([inputIds], [1, inputIds.length], 'int32');
+
+    // NOTE(review): logits are assumed to be [1, seq_len, vocab_size] — confirm against the model
+    const logits3D = gpt.extract().predict(inputTensor) as tf.Tensor3D;
+
+    // Shift logits to align with next-token targets: the last position has no target.
+    // Only the batch axis is squeezed so that a sequence axis of length 1 is kept.
+    const shiftedLogits = logits3D
+      .slice([0, 0, 0], [1, inputIds.length - 1, -1])
+      .squeeze([0]);
+
+    // Target tokens (next tokens), same length as shifted logits
+    const shiftedTargets = inputIds.slice(1);
+    const oneHotLabels = tf.oneHot(tf.tensor1d(shiftedTargets, 'int32'), shiftedLogits.shape[1]);
+
+    // Reduction.NONE keeps one loss per token. The default reduction returns a single mean
+    // over all tokens, which would make the context mask below a no-op.
+    const tokenLosses = tf.losses.softmaxCrossEntropy(
+      oneHotLabels,
+      shiftedLogits,
+      undefined,
+      undefined,
+      tf.Reduction.NONE
+    );
+
+    // Target j is token j + 1 of the input; keep it only if that token lies after the context
+    const mask = tf.tensor1d(shiftedTargets.map((_, j) => (j + 1 >= ctxLength ? 1 : 0)), 'float32');
+
+    // Average over the selected tokens; the lower bound of 1 avoids 0 / 0 when none is selected
+    return tokenLosses.mul(mask).sum().div(tf.maximum(mask.sum(), 1));
+  });
+
+  try {
+    const lossNumber = await lossTensor.array();
+    if (typeof lossNumber !== 'number') {
+      throw new Error('got multiple loss')
+    }
+    return lossNumber;
+  } finally {
+    // The tensor returned by tf.tidy is not cleaned up automatically
+    lossTensor.dispose();
+  }
+}
+
+
+/**
+ * Scores one candidate ending with the ONNX model.
+ * The input sequence is expected to be a concatenation of the context and the ending.
+ * Despite the name, the returned value is a loss (mean negative log-likelihood of the ending
+ * tokens), so lower means the model finds the ending more likely.
+ *
+ * Sources:
+ * https://github.com/karpathy/build-nanogpt/blob/master/hellaswag.py
+ * https://www.youtube.com/watch?v=l8pRSuU81PU
+ *
+ * @param model - the model used for scoring
+ * @param inputIds - token ids of the context followed by the ending
+ * @param ctxLength - number of leading tokens that belong to the context; they are excluded from the loss
+ * @returns the cross-entropy averaged over the ending tokens (0 when there are none)
+ */
+async function computeONNXLogLikelihood(model: ONNXModel, inputIds: number[], ctxLength: number): Promise<number> {
+  const batchInput = List([List(inputIds)]); // [1, seq_len]
+
+  // Run model to get logits as one flat array; row t holds the V logits of position t
+  // NOTE(review): assumes a batch of one and array-like numeric `data` — confirm against ONNXModel
+  const logitsTensor = await model.getLogits(batchInput);
+  const logits = logitsTensor.data as number[];
+  const [, T, V] = logitsTensor.dims;
+
+  // Position t predicts token t + 1, so the last position has no target and positions whose
+  // target is still part of the context are skipped instead of being computed and masked out.
+  const firstPosition = Math.max(ctxLength - 1, 0);
+  const endPosition = Math.min(T, inputIds.length) - 1; // exclusive
+
+  let totalLoss = 0;
+  let scored = 0;
+  for (let t = firstPosition; t < endPosition; t++) {
+    const offset = t * V;
+
+    // Maximum logit, for numerical stability
+    let maxLogit = -Infinity;
+    for (let v = 0; v < V; v++) maxLogit = Math.max(maxLogit, logits[offset + v]);
+
+    let sumExp = 0;
+    for (let v = 0; v < V; v++) sumExp += Math.exp(logits[offset + v] - maxLogit);
+
+    // Cross-entropy as log-sum-exp minus the target logit; unlike -log(softmax), this stays
+    // finite when the target probability underflows to 0
+    totalLoss += Math.log(sumExp) + maxLogit - logits[offset + inputIds[t + 1]];
+    scored++;
+  }
+
+  return totalLoss / (scored || 1); // avoid division by 0
+}
+
+
+type Tokenizer = PreTrainedTokenizer; // tokenizer type accepted by `evaluate`
+type ModelType = GPT | ONNXModel; // the two model kinds `evaluate` can score
+
+/**
+ * Evaluates the model on the HellaSwag dataset.
+ * Every ending of an example is appended to the context and scored by the model; the ending
+ * with the lowest loss is the prediction.
+ *
+ * @param model - The model to evaluate (either GPT or ONNXModel)
+ * @param tokenizer - The tokenizer to use for tokenizing the input text
+ * @param limit - The maximum number of examples to evaluate on (default: 50)
+ * @param print - Whether to print the per-example results (default: true)
+ * @param dataset_path - Dataset file; it is downloaded to this location if missing and read from it
+ * @returns The accuracy of the model on the evaluated examples (0 when none was evaluated)
+ */
+export async function evaluate(
+  model: ModelType,
+  tokenizer: Tokenizer,
+  limit = 50, // Number of examples to evaluate on (set to 10042 for all examples)
+  print = true,
+  dataset_path: string = LOCAL_FILE
+): Promise<number> {
+  await downloadHellaSwag(dataset_path);
+
+  let correct = 0;
+  let total = 0;
+
+  // Read from `dataset_path` itself: `loadExamples` always reads LOCAL_FILE and would ignore
+  // a custom location, including the one the dataset was just downloaded to.
+  const fileStream = fs.createReadStream(dataset_path, 'utf-8');
+  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
+  let lineNumber = 0;
+
+  try {
+    for await (const line of rl) {
+      lineNumber++;
+      if (total >= limit) break;
+      if (line.trim() === '') continue;
+
+      let example: HellaSwagExample;
+      try {
+        example = JSON.parse(line) as HellaSwagExample;
+      } catch (e) {
+        console.error(`Failed to parse line ${lineNumber}:`, line);
+        throw e;
+      }
+
+      const endingTokens = example.endings.map(e =>
+        tokenize(tokenizer, example.ctx + ' ' + e, {
+          truncation: true,
+          max_length: 128
+        }).toArray()
+      );
+
+      // Only the length is needed: it marks where the ending starts in each sequence
+      const ctxTokens = tokenize(tokenizer, example.ctx, {
+        truncation: true,
+        max_length: 128
+      }).toArray();
+
+      const losses: number[] = await Promise.all(
+        endingTokens.map(e =>
+          model instanceof GPT
+            ? computeLogLikelihood(model, e, ctxTokens.length)
+            : computeONNXLogLikelihood(model, e, ctxTokens.length)
+        )
+      );
+
+      const pred = losses.indexOf(Math.min(...losses));
+      if (pred === example.label) correct++;
+      total++;
+
+      // Print the results
+      if (print) {
+        console.log(`\nExample #${total}`);
+        console.log(`Context: ${example.ctx}`);
+        example.endings.forEach((end, i) => {
+          console.log(
+            `  ${i}: ${end}  (loss: ${losses[i].toFixed(4)})${i === example.label ? ' <-- correct' : ''}${i === pred ? ' <-- picked' : ''}`
+          );
+        });
+        const accuracy_temp = correct / total;
+        console.log(`\n Accuracy on ${total} examples: ${(accuracy_temp * 100).toFixed(2)}%`);
+      }
+    }
+  } finally {
+    // Release the file even when evaluation stops early or throws
+    rl.close();
+    fileStream.destroy();
+  }
+
+  // Avoid NaN (0 / 0) when no example was evaluated
+  const accuracy = total === 0 ? 0 : correct / total;
+  console.log(`\nFinal accuracy on ${total} examples: ${(accuracy * 100).toFixed(2)}%`);
+  return accuracy;
+}