diff --git a/.gitignore b/.gitignore index 579b12a..c2f45d8 100644 --- a/.gitignore +++ b/.gitignore @@ -214,6 +214,7 @@ __marimo__/ # personal files *technical_architecture.md -*PLAN.md +*test_outputs/ +*AGENTS.md *personal_experimentation/ *uv.lock \ No newline at end of file diff --git a/config/lm_eval_test_config.yaml b/config/lm_eval_test_config.yaml new file mode 100644 index 0000000..907b28b --- /dev/null +++ b/config/lm_eval_test_config.yaml @@ -0,0 +1,10 @@ +model: hf +model_args: pretrained=gpt2,dtype=float32 +tasks: + - hellaswag +batch_size: 2 +num_fewshot: 0 +output_dir: test_outputs +limit: 3 +device: cpu +seed: 42 \ No newline at end of file diff --git a/eval_converters/common/utils.py b/eval_converters/common/utils.py index 677e22e..96b5bdf 100644 --- a/eval_converters/common/utils.py +++ b/eval_converters/common/utils.py @@ -1,4 +1,10 @@ -from schema.eval_types import Family, HfSplit +from schema.eval_types import ( + BitPrecision, + Family, + HfSplit, + QuantizationMethod, + QuantizationType) +from transformers import AutoConfig def detect_family(model_name: str) -> Family: """Return the Family enum if any of its values is a substring of model_name.""" @@ -25,4 +31,77 @@ def detect_hf_split(split_str: str) -> HfSplit: elif "train" in s: return HfSplit.train else: - return HfSplit.validation \ No newline at end of file + return HfSplit.validation + +def infer_quantization_from_model_name(model_name_or_path: str) -> tuple[BitPrecision, QuantizationMethod, QuantizationType]: + pass + +def infer_quantization_from_model_config(model_name_or_path: str) -> tuple[BitPrecision, QuantizationMethod, QuantizationType]: + pass + +def infer_quantization(model_name_or_path: str) -> tuple[BitPrecision, QuantizationMethod, QuantizationType]: + try: + cfg = AutoConfig.from_pretrained(model_name_or_path) + except Exception as e: + return BitPrecision.none, QuantizationMethod.none, QuantizationType.none + + qcfg = getattr(cfg, 'quantization_config', None) + if not qcfg: + return BitPrecision.none, QuantizationMethod.none, QuantizationType.none + + bits = int(qcfg.get("bits") or qcfg.get("weight_bits") or qcfg.get("q_bits") or 0) + + if bits == 8: + precision = BitPrecision.int8 + elif bits == 4: + precision = BitPrecision.int4 + elif bits == 16: + precision = BitPrecision.float16 + elif bits == 32: + precision = BitPrecision.float32 + else: + precision = BitPrecision.none + + method_key = str(qcfg.get("quant_method") or "").lower() + + method_map = { + "gptq": QuantizationMethod.gptq, + "awq": QuantizationMethod.awq, + } + + type_map = { + "gptq": QuantizationType.static, + "awq": QuantizationType.static, + "bitsandbytes": QuantizationType.dynamic, + "quanto": QuantizationType.static, + "hqq": QuantizationType.static, + "torchao": QuantizationType.static, + } + + qmethod = method_map.get(method_key, QuantizationMethod.none) + qtype = type_map.get(method_key, QuantizationType.none) + return precision, qmethod, qtype + +def extract_context_window_from_config(model): + try: + config = AutoConfig.from_pretrained(model) + + priority_fields = [ + "max_position_embeddings", + "n_positions", + "seq_len", + "seq_length", + "n_ctx", + "sliding_window" + ] + + context_window = next((getattr(config, f) for f in priority_fields if hasattr(config, f)), None) + if context_window is None: + context_window = 1 + + except Exception as e: + print(f"Error getting context window: {e}") + context_window = 1 + + finally: + return context_window \ No newline at end of file diff --git a/eval_converters/helm/adapter.py
b/eval_converters/helm/adapter.py index 008c110..1f0f3b9 100644 --- a/eval_converters/helm/adapter.py +++ b/eval_converters/helm/adapter.py @@ -17,7 +17,7 @@ from schema import SCHEMA_VERSION from eval_converters.common.adapter import BaseEvaluationAdapter, AdapterMetadata, SupportedLibrary -from eval_converters.common.utils import detect_family, detect_hf_split +from eval_converters.common.utils import detect_family, detect_hf_split, infer_quantization, extract_context_window_from_config from .utils import detect_prompt_class, get_adapter_class_from_method_string from transformers import AutoConfig @@ -25,49 +25,6 @@ # run this just once in your process to initialize the registry register_builtin_configs_from_helm_package() -def infer_quantization(model_name_or_path: str): - """ - Returns (BitPrecision, Method) enums for the given HF model. - """ - try: - cfg = AutoConfig.from_pretrained(model_name_or_path) - except Exception as e: - raise ValueError( - f"Failed to load model config for {model_name_or_path}: {e} \n" - "This may happen if you are using a HELM model name instead of HuggingFace model name in the adapter_spec.model field." - "For example, HELM uses 'meta/llama-3.1-8b-instruct' while HuggingFace uses meta-llama/llama-3.1-8b-instruct' \n" - "Please verify the model name and try again." - ) - qcfg = getattr(cfg, "quantization_config", None) - - if qcfg is None: - return BitPrecision.none, Method.None_ - - bits = int(qcfg.get("bits") or qcfg.get("weight_bits") or qcfg.get("q_bits")) - - if bits == 8: - precision = BitPrecision.int8 - elif bits == 4: - precision = BitPrecision.int4 - elif bits == 16: - precision = BitPrecision.float16 - elif bits == 32: - precision = BitPrecision.float32 - else: - precision = BitPrecision.none - - method_key = qcfg.get("quant_method") or "" - method_map = { - "gptq": Method.static, - "awq": Method.static, - "bitsandbytes": Method.dynamic, - "quanto": Method.static, - "hqq": Method.static, - "torchao": Method.static, - } - - method = method_map.get(method_key, Method.None_) - return precision, method class HELMAdapter(BaseEvaluationAdapter): """ @@ -148,33 +105,14 @@ def transform_from_directory(self, dir_path): ) # 1.2. 
Configuration - # HELM does not provide context window size, try loading it from model config, else set to 1 - try: - # try getting context window from model deployment - deployment = get_model_deployment(adapter_spec.model_deployment) - if deployment and deployment.max_sequence_length is not None: - context_window = deployment.max_sequence_length - - # if not available, try loading it from model config - else: - config = AutoConfig.from_pretrained(adapter_spec.model) - - priority_fields = [ - "max_position_embeddings", - "n_positions", - "seq_len", - "seq_length", - "n_ctx", - "sliding_window" - ] - - context_window = next((getattr(config, f) for f in priority_fields if hasattr(config, f)), None) - if context_window is None: - context_window = 1 - - except Exception as e: - print(f"Error getting context window: {e}") - context_window = 1 + # HELM does not provide context window size, try loading it from model deployment, else set to 1 + deployment = get_model_deployment(adapter_spec.model_deployment) + if deployment and deployment.max_sequence_length is not None: + context_window = deployment.max_sequence_length + + # if not available, try loading it from model config, else set to 1 + else: + context_window = extract_context_window_from_config(adapter_spec.model) configuration = Configuration( context_window=context_window, @@ -336,33 +274,14 @@ def _transform_single(self, raw_data, base_dir=None): ) # 1.2. Configuration - # HELM does not provide context window size, try loading it from model config, else set to 1 - try: - # try getting context window from model deployment - deployment = get_model_deployment(adapter_spec.model_deployment) - if deployment and deployment.max_sequence_length is not None: - context_window = deployment.max_sequence_length - - # if not available, try loading it from model config - else: - config = AutoConfig.from_pretrained(adapter_spec.model) - - priority_fields = [ - "max_position_embeddings", - "n_positions", - "seq_len", - "seq_length", - "n_ctx", - "sliding_window" - ] - - context_window = next((getattr(config, f) for f in priority_fields if hasattr(config, f)), None) - if context_window is None: - context_window = 1 - - except Exception as e: - print(f"Error getting context window: {e}") - context_window = 1 + # HELM does not provide context window size, try loading it from model deployment + deployment = get_model_deployment(adapter_spec.model_deployment) + if deployment and deployment.max_sequence_length is not None: + context_window = deployment.max_sequence_length + + # if not available, try loading it from model config, else set to 1 + else: + context_window = extract_context_window_from_config(adapter_spec.model) configuration = Configuration( context_window=context_window, diff --git a/eval_converters/helm/utils.py b/eval_converters/helm/utils.py index 247cfd2..6594436 100644 --- a/eval_converters/helm/utils.py +++ b/eval_converters/helm/utils.py @@ -59,4 +59,4 @@ def get_adapter_class_from_method_string(method_str: str) -> type[Adapter]: if key in method_str: return mapping[key] - raise ValueError(f"Unknown adapter method string: {method_str}") + raise ValueError(f"Unknown adapter method string: {method_str}") \ No newline at end of file diff --git a/eval_converters/lm_eval/adapter.py b/eval_converters/lm_eval/adapter.py new file mode 100644 index 0000000..ea4b0e7 --- /dev/null +++ b/eval_converters/lm_eval/adapter.py @@ -0,0 +1,302 @@ +from __future__ import annotations +import logging +import os +import json + +from pathlib import Path +from 
typing import Any, Dict, List, Optional, Iterable, Union + +import yaml + +from schema import SCHEMA_VERSION +from schema.eval_types import ( + EvaluationResult, + ModelInfo, + Configuration, + GenerationArgs, + InferenceSettings, + Instance, + Output, + Evaluation, + EvaluationMethod, + PromptConfig, + PromptClass, + TaskType, + SampleIdentifier, + Quantization, + Model, + InstructionPhrasing, + QuantizationType, +) + +from eval_converters.common.adapter import BaseEvaluationAdapter, AdapterMetadata, SupportedLibrary +from eval_converters.common.utils import detect_family, detect_hf_split, infer_quantization, extract_context_window_from_config +from .utils import detect_prompt_class, MAIN_METRIC_BY_TASK, MULTIPLE_CHOICE_TASKS + +# Core Adapter + +class LMEvalAdapter(BaseEvaluationAdapter): + + # dumped config file: config.yaml + # samples file: samples_TIMESTAMP.jsonl generated by lm-eval during execution + # results file: results_TIMESTAMP.json generated by lm-eval during execution + + CONFIG_FILE = "config.yaml" + RESULTS_FILE = "results.json" + SAMPLES_FILE = "samples.jsonl" + + @property + def metadata(self) -> AdapterMetadata: + return AdapterMetadata( + name="LMEvalAdapter", + version="0.0.1", + supported_library_versions=["0.4.0", "0.5.0", "0.5.1"], + description="Adapter for transforming LM-Eval evaluation outputs to unified schema format" + ) + + @property + def supported_library(self) -> SupportedLibrary: + return SupportedLibrary.LM_EVAL + + def _load_file(self, file_path: Path) -> Any: + if file_path.suffix == ".jsonl": + return [json.loads(line) for line in file_path.read_text().splitlines()] + if file_path.suffix == ".json": + return json.loads(file_path.read_text()) + if file_path.suffix in (".yaml", ".yml"): + return yaml.safe_load(file_path.read_text()) + raise ValueError(f"Unsupported file suffix: {file_path.suffix}") + + def transform_from_directory(self, dir_path: Union[str, Path]) -> List[EvaluationResult]: + dir_path = Path(dir_path) + if not dir_path.is_dir(): + raise FileNotFoundError(f"Directory {dir_path} does not exist") + + cfg_path = os.path.join(dir_path, self.CONFIG_FILE) + cfg: Dict[str, Any] = {} + if os.path.exists(cfg_path): + with open(cfg_path, "r", encoding="utf-8") as f: + cfg = yaml.safe_load(f) or {} + + else: + logging.warning("config.yaml not found - falling back to default config") + + # Extract model name from config or fallback to directory name + if isinstance(cfg.get("model_args"), dict): + model_name = cfg["model_args"].get("pretrained", cfg.get("model", "unknown-model")) + + elif isinstance(cfg.get("model_args"), str) and "pretrained=" in cfg.get("model_args", ""): + # Extract from string format: "pretrained=gpt2,dtype=float32" + for part in cfg["model_args"].split(","): + if part.strip().startswith("pretrained="): + model_name = part.split("=", 1)[1].strip() + break + else: + model_name = cfg.get("model", "unknown-model") + else: + # Fallback to directory name if no model info in config + model_name = dir_path.name if dir_path.name != "."
else "unknown-model" + + precision, quant_method, quant_type = infer_quantization(model_name) + + generation_args = GenerationArgs( + temperature = cfg.get("temperature", 0.0), + top_p = cfg.get("top_p", 1.0), + top_k = cfg.get("top_k", 20), + max_tokens = cfg.get("max_tokens"), + ) + + inference_settings = InferenceSettings( + quantization = Quantization(bit_precision=precision, method=quant_method, type=quant_type), + generation_args = generation_args, + ) + + context_window = extract_context_window_from_config(model_name) + + model_block = Model( + model_info = ModelInfo( + name=model_name, + provider = (model_name.split("/", 1)[0] if "/" in model_name else "unknown"), + family = detect_family(model_name), + ), + configuration = Configuration( + context_window = context_window, + ), + inference_settings = inference_settings, + ) + + # Load task-level metrics + task_scores: Dict[str, Dict[str, float]] = {} + results_path = self._find_first_file(dir_path, [self.RESULTS_FILE]) + + if results_path: + with open(results_path, "r", encoding="utf-8") as f: + results = json.load(f) + + # Enumerate per-instance samples + pred_path = self._find_first_file(dir_path, [self.SAMPLES_FILE]) + if pred_path is None: + raise FileNotFoundError("No samples file found") + + evaluations: List[EvaluationResult] = [] + with open(pred_path, "r", encoding="utf-8") as f: + for line_idx, line in enumerate(f): + if not line.strip(): + continue + + record = json.loads(line) + + # Extract task name from record, config, or filename + task_name = record.get("task") + if not task_name: + # Try from config + tasks = cfg.get("tasks", []) + if tasks: + task_name = tasks[0] + else: + # Try to extract from filename (e.g., "samples_hellaswag_2025...") + filename = pred_path.name + if "_" in filename: + parts = filename.split("_") + for part in parts: + if part in MAIN_METRIC_BY_TASK or part.lower() in MULTIPLE_CHOICE_TASKS: + task_name = part + break + + if not task_name: + task_name = "unknown_task" + prompt_class = detect_prompt_class(task_name) + + # Provide a default instruction phrasing so PromptConfig validates + if prompt_class == PromptClass.MultipleChoice: + instruction_text = "Choose the correct answer from the options." + elif prompt_class == PromptClass.OpenEnded: + instruction_text = "Provide a helpful, concise answer." + else: + instruction_text = "Complete the prompt appropriately." 
+ default_phrasing = InstructionPhrasing(name="default", text=instruction_text) + + raw_inp = record.get("input") or record.get("question") or record.get("ctx") or "" + # use explicit None checks so a valid label/prediction of 0 is not treated as missing + ground_truth = next((v for v in (record.get("label"), (record.get("answers") or [None])[0], record.get("target")) if v is not None), "") + prediction = next((v for v in (record.get("prediction"), record.get("pred"), record.get("decoded"), record.get("response")) if v is not None), "") + + # compute 0/1 correctness if not provided + if "correct" in record: + score = 1.0 if record["correct"] else 0.0 + else: + # some tasks provide numeric labels - normalize to str equality + score = 1.0 if str(prediction).strip() == str(ground_truth).strip() else 0.0 + + # Allow task-level main metric override + metric_name = MAIN_METRIC_BY_TASK.get(task_name, "accuracy") + if metric_name in record: + # use per-instance metric score if available (e.g., acc_norm for hellaswag) + score = record[metric_name] + + evaluation = Evaluation( + evaluation_method = EvaluationMethod( + method_name = "lm-eval-harness", + description = "0-1 correctness computed from per-instance prediction", + ), + ground_truth = str(ground_truth), + score = score, + classification_fields = self._build_classification_fields(record, ground_truth) if prompt_class == PromptClass.MultipleChoice else None + ) + + instance = Instance( + task_type = TaskType.classification if prompt_class == PromptClass.MultipleChoice else TaskType.generation, + raw_input = str(raw_inp), + language = "en", # harness tasks are predominantly English - override if known + sample_identifier = SampleIdentifier( + dataset_name = task_name, + hf_repo = "", # not available in harness + hf_split = detect_hf_split(record.get("split", "test")), + hf_index = int(record.get("idx", line_idx)), + ), + classification_fields = self._build_classification_fields(record, ground_truth) if prompt_class == PromptClass.MultipleChoice else None + ) + + evaluations.append(EvaluationResult( + schema_version = SCHEMA_VERSION, + evaluation_id = f"{task_name}:{record.get('idx', line_idx)}", + model = model_block, + prompt_config = PromptConfig(prompt_class=prompt_class, instruction_phrasing=default_phrasing), + instance = instance, + output = Output(response = str(prediction)), + evaluation = evaluation, + )) + + return evaluations + + @staticmethod + def _build_classification_fields(rec: Dict[str, Any], ground_truth: str) -> Dict[str, Any]: + choices = rec.get("choices") or rec.get("options") or rec.get("mc_options") + if choices and isinstance(choices, (list, tuple)): + formatted = [ + {"id": str(i), "text": str(c)} for i,c in enumerate(choices) + ] + + return { + "full_input": rec.get("input") or rec.get("question") or rec.get("ctx") or "", + "question": rec.get("question") or rec.get("input", ""), + "choices": formatted, + "ground_truth": {"id": str(ground_truth), "text": choices[int(ground_truth)]} if isinstance(ground_truth, int) else {"text": ground_truth}, + } + + return {} + + @staticmethod + def _find_first_file(root: Path, names: Iterable[str]) -> Optional[Path]: + + # First try exact matches + for name in names: + p = root / name + if p.exists(): + return p + + # Then try glob patterns for timestamped files based on requested file type + for name in names: + if name.endswith('.jsonl'): + # Looking for samples file + matches = list(root.glob("samples_*.jsonl")) + if matches: + return matches[0] + elif name.endswith('.json'): + # Looking for results file + matches = list(root.glob("results_*.json")) + if matches: + return matches[0] + + # recursively search one
level down (each task often has its own folder) + for sub_dir in root.iterdir(): + if sub_dir.is_dir(): + # Try exact names first + for name in names: + p = sub_dir / name + if p.exists(): + return p + + # Try patterns in subdirectories based on requested file type + for name in names: + if name.endswith('.jsonl'): + matches = list(sub_dir.glob("samples_*.jsonl")) + if matches: + return matches[0] + elif name.endswith('.json'): + matches = list(sub_dir.glob("results_*.json")) + if matches: + return matches[0] + + return None + + def _transform_single(self, raw_data: Union[str, Dict[str, Any]]) -> List[EvaluationResult]: + # lm_eval already works with a single config yaml file, + # so we don't need to support single-dict transform; if given a file path -> transform_from_directory + + if isinstance(raw_data, dict): + raise ValueError("Single-dict transform is unsupported") + + if isinstance(raw_data, str) and os.path.isfile(raw_data): + tmp_dir = Path(raw_data).parent + return self.transform_from_directory(tmp_dir) + + raise ValueError(f"Unsupported raw_data type for LMEvalAdapter: {type(raw_data)}") + + + diff --git a/eval_converters/lm_eval/converter.py b/eval_converters/lm_eval/converter.py index e69de29..daeffd5 100644 --- a/eval_converters/lm_eval/converter.py +++ b/eval_converters/lm_eval/converter.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import os +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Union + +import yaml + + +class LMEvalRunner: # noqa: D101 + def __init__(self, config_path: Union[str, Path]): + self.config_path = Path(config_path) + with open(self.config_path, "r", encoding="utf-8") as f: + self.cfg: Dict[str, Any] = yaml.safe_load(f) + + # Setup output directory at root level + self.output_dir = Path(self.cfg.get("output_dir", "outputs")) + self.output_dir.mkdir(parents=True, exist_ok=True) + + def _build_cli(self) -> List[str]: + model = self.cfg.get("model", "hf") + + # Build model_args string from config + model_args = self.cfg.get("model_args", "") + if isinstance(model_args, dict): + model_args = ",".join([f"{k}={v}" for k, v in model_args.items()]) + + # Accept tasks as list or string + raw_tasks = self.cfg.get("tasks", []) + if isinstance(raw_tasks, (list, tuple)): + tasks = ",".join(str(t) for t in raw_tasks) + else: + tasks = str(raw_tasks) + + batch_size = str(self.cfg.get("batch_size", 1)) + device = self.cfg.get("device", "cuda" if self.cfg.get("model") == "hf" else "cpu") + + cli = [ + "lm-eval", + "--model", model, + "--model_args", model_args, + "--tasks", tasks, + "--batch_size", batch_size, + "--output_path", str(self.output_dir), + "--device", device, + "--log_samples", # writes per-sample outputs (samples_*.jsonl) + ] + + # Add optional parameters + if self.cfg.get("num_fewshot") is not None: + cli.extend(["--num_fewshot", str(self.cfg["num_fewshot"])]) + + if self.cfg.get("limit"): + cli.extend(["--limit", str(self.cfg["limit"])]) + + if self.cfg.get("temperature") is not None: # don't drop an explicit temperature of 0.0 + cli.extend(["--gen_kwargs", f"temperature={self.cfg['temperature']}"]) + + if self.cfg.get("apply_chat_template"): + cli.append("--apply_chat_template") + + if self.cfg.get("seed") is not None: + cli.extend(["--seed", str(self.cfg["seed"])]) + + return cli + + def run(self) -> None: # noqa: D401 + cli = self._build_cli() + proc = subprocess.run(cli, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + if proc.returncode != 0: + raise RuntimeError(f"LMEval failed with exit code {proc.returncode}.
Output: {proc.stdout}") + print(proc.stdout) + + # Write config to model subdirectory where adapter expects it + if "pretrained=" in str(self.cfg.get("model_args", "")): + model_name = str(self.cfg["model_args"]).split("pretrained=")[1].split(",")[0] + + model_dir = Path(self.output_dir) / model_name + if model_dir.exists(): + with open(model_dir / "config.yaml", "w") as f: + yaml.safe_dump(self.cfg, f) diff --git a/eval_converters/lm_eval/utils.py b/eval_converters/lm_eval/utils.py new file mode 100644 index 0000000..cec80a6 --- /dev/null +++ b/eval_converters/lm_eval/utils.py @@ -0,0 +1,31 @@ +from typing import Dict +from schema.eval_types import PromptClass + +MULTIPLE_CHOICE_TASKS = { + "hellaswag", + "piqa", + "siqa", + "winogrande", + "openbookqa", + "arc_easy", + "arc_challenge", + "boolq", + "copa", + "wic", + "anli_r1", + "anli_r2", + "anli_r3", +} + +MAIN_METRIC_BY_TASK: Dict[str, str] = { + **{t: "acc_norm" for t in ["hellaswag", "copa", "arc_easy", "arc_challenge"]}, + **{t: "acc" for t in ["piqa", "boolq", "winogrande", "openbookqa", "wic"]}, + # generative tasks often expose `exact_match` / `bleu` - handled ad-hoc +} + +def detect_prompt_class(task_name: str) -> PromptClass: + name = task_name.lower() + if name in MULTIPLE_CHOICE_TASKS: + return PromptClass.MultipleChoice + return PromptClass.OpenEnded + diff --git a/pyproject.toml b/pyproject.toml index dc5edf3..84f78cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,3 +28,7 @@ dev = [ [tool.setuptools.packages.find] include = ["helm*", "schema*", "common*", "config*", "eval_converters*"] exclude = ["tests*"] + +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/schema/eval_types.py b/schema/eval_types.py index 98e4eed..24a5eb9 100644 --- a/schema/eval_types.py +++ b/schema/eval_types.py @@ -5,9 +5,9 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Annotated, Any, Dict, List, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, confloat, conint +from pydantic import BaseModel, ConfigDict, Field class Family(Enum): @@ -27,6 +27,9 @@ class ModelInfo(BaseModel): name: str = Field( ..., description="Model name and version (e.g., 'Llama-2-13b-chat-hf')" ) + provider: str = Field( + ..., description="Name of the provider that shared the model used for evaluation'" + ) family: Optional[Family] = Field(None, description='Model family') @@ -41,10 +44,10 @@ class Configuration(BaseModel): architecture: Optional[Architecture] = Field( None, description='Model architecture type' ) - parameters: Optional[conint(ge=1)] = Field( + parameters: Optional[Annotated[int, Field(ge=1)]] = Field( None, description='Number of parameters in billions' ) - context_window: conint(ge=1) = Field( + context_window: Optional[Annotated[int, Field(ge=1)]] = Field( ..., description='Maximum context window size in tokens' ) is_instruct: Optional[bool] = Field( @@ -62,15 +65,21 @@ class BitPrecision(Enum): float32 = 'float32' -class Method(Enum): - None_ = 'None' +class QuantizationType(Enum): + none = 'None' dynamic = 'dynamic' static = 'static' +class QuantizationMethod(Enum): + awq = 'AWQ' + gptq = 'GPTQ' + none = 'None' + class Quantization(BaseModel): - bit_precision: BitPrecision = Field(..., description='Quantization bit precision') - method: Method = Field(..., description='Quantization method') + bit_precision: Optional[BitPrecision] = Field(..., description='Quantization bit precision') + 
method: Optional[QuantizationMethod] = Field(..., description='Quantization algorithm (e.g., GPTQ, AWQ)') + type: Optional[QuantizationType] = Field(..., description='Quantization type (static or dynamic)') class GenerationArgs(BaseModel): @@ -80,12 +89,30 @@ class GenerationArgs(BaseModel): temperature: Optional[float] = Field(None, description='Sampling temperature') top_p: Optional[float] = Field(None, description='Nucleus sampling parameter') top_k: Optional[float] = Field(None, description='Top-k sampling parameter') - max_tokens: Optional[conint(ge=1)] = Field( + max_tokens: Optional[Annotated[int, Field(ge=1)]] = Field( None, description='Maximum number of tokens to generate' ) stop_sequences: Optional[List[str]] = Field( [], description='Sequences that stop generation' ) + seed: Optional[float] = Field( + 5.0, description='Random seed' + ) + frequency_penalty: Optional[float] = Field( + 0.0, description='Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim' + ) + presence_penalty: Optional[float] = Field( + 0.0, description='Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics.' + ) + logit_bias: Optional[Dict[int, float]] = Field( + None, description='Map token IDs to an associated bias value' + ) + logprobs: Optional[bool] = Field( + False, description='Return log probabilities of the output tokens' + ) + top_logprobs: Optional[int] = Field( + 1, description='Number of most likely tokens (0-20) to return at each token position' + ) class InferenceSettings(BaseModel): @@ -156,11 +183,10 @@ class Dimensions(BaseModel): enumerator: Enumerator = Field( ..., description='Style of enumeration for multiple choice options' ) - instruction_phrasing: InstructionPhrasing separator: Separator = Field( ..., description='Character(s) used to separate multiple choice options' ) - shots: conint(ge=0, le=10) = Field( + shots: Optional[Annotated[int, Field(ge=0, le=10)]] = Field( ..., description='Number of examples provided in the prompt' ) @@ -169,6 +195,7 @@ class PromptConfig(BaseModel): prompt_class: PromptClass = Field( ..., description='Type of task and its formatting requirements' ) + instruction_phrasing: InstructionPhrasing dimensions: Optional[Dimensions] = Field( None, description='Format-specific configuration dimensions' ) @@ -193,9 +220,8 @@ class HfSplit(Enum): class SampleIdentifier(BaseModel): dataset_name: str = Field(..., description='Name of the source dataset') hf_repo: str = Field(..., description='HuggingFace repository identifier') - hf_split: HfSplit = Field(..., description='HuggingFace split identifier') hf_index: int = Field(..., description='Index in the HuggingFace dataset') - + hf_split: Optional[HfSplit] = Field(..., description='HuggingFace split identifier') class PromptLogprob(BaseModel): token_id: float = Field( @@ -290,7 +316,7 @@ class Evaluation(BaseModel): description='Method used to evaluate the answer, including predefined methods and user-defined methods.', ) ground_truth: str = Field(..., description='The correct answer for evaluation') - score: confloat(ge=0.0, le=1.0) = Field( + score: Annotated[float, Field(ge=0.0, le=1.0)] = Field( ..., description="Binary score indicating whether the model's answer was correct (1.0) or incorrect (0.0)", ) diff --git
a/tests/test_lm_eval_adapter.py b/tests/test_lm_eval_adapter.py new file mode 100644 index 0000000..2cbe6c1 --- /dev/null +++ b/tests/test_lm_eval_adapter.py @@ -0,0 +1,99 @@ +from pathlib import Path +import json +import textwrap +import yaml + +import pytest + +from eval_converters.lm_eval.adapter import LMEvalAdapter + +def create_tmp_lm_eval_dir(tmp_path: Path) -> Path: + """Create a temporary directory with mock lm-eval output files""" + # Ensure the directory exists + tmp_path.mkdir(parents=True, exist_ok=True) + + # config.yaml + cfg = { + "model": "hf-causal", + "model_args": {"pretrained": "gpt2"}, + "tasks": ["hellaswag"], + "temperature": 0.7, + } + (tmp_path / "config.yaml").write_text(yaml.safe_dump(cfg), encoding="utf-8") + + # results.json (task-level) + (tmp_path / "results.json").write_text(json.dumps({ + "hellaswag": { + "acc_norm": 0.75, + } + }), encoding="utf-8") + + # samples.jsonl - two examples + preds = textwrap.dedent( + """ + {"task": "hellaswag", "idx": 0, "input": "Q1", "choices": ["A", "B", "C", "D"], "label": 2, "prediction": 2, "correct": true} + {"task": "hellaswag", "idx": 1, "input": "Q2", "choices": ["A", "B", "C", "D"], "label": 1, "prediction": 3, "correct": false} + """ + ).strip() + (tmp_path / "samples.jsonl").write_text(preds, encoding="utf-8") + + return tmp_path + + +@pytest.fixture +def tmp_lm_eval_dir(tmp_path: Path) -> Path: + """Pytest fixture wrapper for create_tmp_lm_eval_dir""" + return create_tmp_lm_eval_dir(tmp_path) + + +def test_transform_from_directory(tmp_lm_eval_dir: Path): + adapter = LMEvalAdapter() + results = adapter.transform_from_directory(tmp_lm_eval_dir) + + assert isinstance(results, list) + assert len(results) == 2 + for r in results: + assert r.schema_version + assert r.model.model_info.name == "gpt2" + assert r.instance.raw_input.startswith("Q") + assert r.evaluation.score in {0.0, 1.0} + + +def main(): + + tmp_dir = create_tmp_lm_eval_dir(Path("/tmp/test_lm_eval")) + + try: + test_transform_from_directory(tmp_dir) + except Exception as e: + print(f"test_transform_from_directory: FAILED - {e}") + return False + + # Test on real output if available + real_output_dir = Path("test_outputs") + if real_output_dir.exists(): + subdirs = [d for d in real_output_dir.iterdir() if d.is_dir()] + if subdirs: + try: + adapter = LMEvalAdapter() + results = list(adapter.transform_from_directory(subdirs[0])) + + if results: + sample = results[0] + print(f" Model: {sample.model.model_info.name}") + print(f" Family: {sample.model.model_info.family}") + print(f" Score: {sample.evaluation.score}") + print(f" Method: {sample.evaluation.evaluation_method.method_name}") + except Exception as e: + print(f"Real output test failed: {e}") + else: + print("No output subdirectories found") + else: + print("No real output found (run LMEvalRunner first)") + + return True + +if __name__ == "__main__": + import sys + success = main() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/tests/test_lm_eval_converter.py b/tests/test_lm_eval_converter.py new file mode 100644 index 0000000..867501a --- /dev/null +++ b/tests/test_lm_eval_converter.py @@ -0,0 +1,340 @@ +import pytest +from pathlib import Path +import tempfile +import yaml +import json +from unittest.mock import patch, MagicMock + +from eval_converters.lm_eval.converter import LMEvalRunner + + +@pytest.fixture +def test_config(): + """Create a test configuration""" + return { + "model": "hf", + "model_args": "pretrained=gpt2,dtype=float32", + "tasks": ["hellaswag"], + 
"batch_size": 2, + "num_fewshot": 0, + "output_dir": "test_outputs", + "limit": 5, + "device": "cpu", + "seed": 42 + } + + +@pytest.fixture +def config_file(tmp_path, test_config): + """Create a temporary config file""" + config_path = tmp_path / "test_config.yaml" + with open(config_path, "w") as f: + yaml.dump(test_config, f) + return config_path + + +def test_lm_eval_runner_init(config_file, test_config): + """Test LMEvalRunner initialization""" + runner = LMEvalRunner(config_file) + + assert runner.config_path == Path(config_file) + assert runner.cfg == test_config + assert runner.output_dir.name == "test_outputs" + assert runner.output_dir.exists() + + +def test_build_cli(config_file): + """Test CLI command building""" + runner = LMEvalRunner(config_file) + cli = runner._build_cli() + + # Check basic structure + assert cli[0] == "lm-eval" + + # Check model args + assert "--model" in cli + assert "hf" in cli + assert "--model_args" in cli + assert "pretrained=gpt2,dtype=float32" in cli + + # Check tasks + assert "--tasks" in cli + assert "hellaswag" in cli + + # Check other parameters + assert "--batch_size" in cli + assert "2" in cli + assert "--device" in cli + assert "cpu" in cli + assert "--log_samples" in cli + assert "--num_fewshot" in cli + assert "0" in cli + assert "--limit" in cli + assert "5" in cli + assert "--seed" in cli + assert "42" in cli + + +def test_build_cli_with_dict_model_args(tmp_path): + """Test CLI building when model_args is a dictionary""" + config = { + "model": "hf", + "model_args": { + "pretrained": "gpt2", + "dtype": "float16", + "trust_remote_code": True + }, + "tasks": ["piqa"], + "batch_size": 4, + "output_dir": str(tmp_path / "outputs") + } + + config_path = tmp_path / "dict_config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + runner = LMEvalRunner(config_path) + cli = runner._build_cli() + + # Find model_args in CLI + model_args_idx = cli.index("--model_args") + 1 + model_args_str = cli[model_args_idx] + + # Check that dictionary was converted to comma-separated string + assert "pretrained=gpt2" in model_args_str + assert "dtype=float16" in model_args_str + assert "trust_remote_code=True" in model_args_str + + +def test_build_cli_optional_params(tmp_path): + """Test CLI building with optional parameters""" + config = { + "model": "hf", + "model_args": "pretrained=gpt2", + "tasks": ["arc_easy"], + "batch_size": 1, + "output_dir": str(tmp_path / "outputs"), + "temperature": 0.7, + "apply_chat_template": True + } + + config_path = tmp_path / "optional_config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + runner = LMEvalRunner(config_path) + cli = runner._build_cli() + + # Check temperature is added + assert "--gen_kwargs" in cli + temp_idx = cli.index("--gen_kwargs") + 1 + assert cli[temp_idx] == "temperature=0.7" + + # Check chat template flag + assert "--apply_chat_template" in cli + + +@patch('subprocess.run') +def test_run_success(mock_run, config_file): + """Test successful run""" + mock_process = MagicMock() + mock_process.returncode = 0 + mock_process.stdout = "Evaluation completed successfully" + mock_run.return_value = mock_process + + runner = LMEvalRunner(config_file) + runner.run() + + # Check subprocess was called + mock_run.assert_called_once() + call_args = mock_run.call_args[0][0] + + # Verify it's running lm-eval + assert "lm-eval" in call_args + + +@patch('subprocess.run') +def test_run_failure(mock_run, config_file): + """Test failed run""" + mock_process = MagicMock() + 
mock_process.returncode = 1 + mock_process.stdout = "Error: Model not found" + mock_run.return_value = mock_process + + runner = LMEvalRunner(config_file) + + with pytest.raises(RuntimeError) as exc_info: + runner.run() + + assert "LMEval failed with exit code 1" in str(exc_info.value) + assert "Model not found" in str(exc_info.value) + + +def test_output_dir_creation(tmp_path): + """Test that output directory is created properly""" + output_dir = tmp_path / "custom_outputs" + config = { + "model": "hf", + "model_args": "pretrained=gpt2", + "tasks": ["boolq"], + "batch_size": 1, + "output_dir": str(output_dir) + } + + config_path = tmp_path / "output_config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + runner = LMEvalRunner(config_path) + + assert output_dir.exists() + assert output_dir.is_dir() + + +def test_real_evaluation_with_test_config(): + """Test running a real evaluation using the test config""" + from pathlib import Path + import json + + # Use the existing test config + config_path = Path("config/lm_eval_test_config.yaml") + + print(f"\nRunning real evaluation with {config_path}") + + try: + runner = LMEvalRunner(config_path) + + # Clear test_outputs directory first + import shutil + if runner.output_dir.exists(): + shutil.rmtree(runner.output_dir) + + print(f"Output directory: {runner.output_dir}") + print("Running evaluation...") + + runner.run() + + print("Evaluation completed!") + + # Check that output files were created + assert runner.output_dir.exists(), "Output directory should exist" + + # Check for key output files + config_file = runner.output_dir / "config.yaml" + results_file = runner.output_dir / "results.json" + + # At least one of these should exist + pred_files = list(runner.output_dir.glob("*predictions*.jsonl")) + \ + list(runner.output_dir.glob("*samples*.jsonl")) + + print(f"Files created:") + for file_path in runner.output_dir.iterdir(): + if file_path.is_file(): + print(f" - {file_path.name} ({file_path.stat().st_size} bytes)") + + # Basic assertions + assert config_file.exists() or results_file.exists(), "Should have config.yaml or results.json" + + # Test the adapter if we have prediction files + if pred_files: + print("Testing adapter transformation...") + from eval_converters.lm_eval.adapter import LMEvalAdapter + adapter = LMEvalAdapter() + results = list(adapter.transform_from_directory(runner.output_dir)) + + print(f"Adapter processed {len(results)} evaluation results") + + if results: + sample = results[0] + assert sample.schema_version + assert sample.model.model_info.name + assert sample.evaluation.score is not None + print(f"Sample model: {sample.model.model_info.name}") + print(f"Sample score: {sample.evaluation.score}") + + return True + + except Exception as e: + print(f"Error during evaluation: {type(e).__name__}: {e}") + # Don't fail the test if lm-eval isn't properly installed or there are environment issues + pytest.skip(f"Skipping real evaluation test due to: {e}") + return False + + +def main(): + # Test 1: Configuration loading and CLI building + try: + from pathlib import Path + config_path = Path("config/lm_eval_test_config.yaml") + + if not config_path.exists(): + print(f"Config file not found: {config_path}") + return False + + runner = LMEvalRunner(config_path) + print(f"Config loaded: {runner.cfg.get('model')} model") + print(f"Tasks: {runner.cfg.get('tasks')}") + print(f"Output dir: {runner.output_dir}") + + # Test CLI building + cli = runner._build_cli() + print(f"CLI built: {' '.join(cli[:5])}...") + + 
except Exception as e: + print(f"Configuration test failed: {e}") + return False + + # Test 2: Real evaluation + try: + # Clear test_outputs first + import shutil + if runner.output_dir.exists(): + shutil.rmtree(runner.output_dir) + + runner.run() + + # Check output files + if runner.output_dir.exists(): + for file_path in runner.output_dir.rglob("*"): + if file_path.is_file(): + size_kb = file_path.stat().st_size / 1024 + print(f"{file_path.relative_to(runner.output_dir)} ({size_kb:.1f}KB)") + + except Exception as e: + print(f"Real evaluation failed (this is OK for testing): {e}") + + # Test 3: Adapter integration + try: + from eval_converters.lm_eval.adapter import LMEvalAdapter + adapter = LMEvalAdapter() + + # Find output directories + output_dirs = [] + if runner.output_dir.exists(): + output_dirs = [d for d in runner.output_dir.rglob("*") if d.is_dir() and any(d.glob("*.json*"))] + + if output_dirs: + test_dir = output_dirs[0] + + results = list(adapter.transform_from_directory(test_dir)) + + if results: + sample = results[0] + print(f" Model: {sample.model.model_info.name}") + print(f" Family: {sample.model.model_info.family}") + print(f" Score: {sample.evaluation.score}") + print(f" Method: {sample.evaluation.evaluation_method.method_name}") + else: + print("No output directories found for adapter testing") + + except Exception as e: + print(f"Adapter integration test failed: {e}") + return False + + return True + + +if __name__ == "__main__": + import sys + success = main() + sys.exit(0 if success else 1) \ No newline at end of file
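
The converter and adapter in this diff are meant to compose into a two-step pipeline: LMEvalRunner shells out to lm-eval and dumps config/results/samples into the config's output_dir, and LMEvalAdapter turns that directory into unified-schema records. A minimal usage sketch, assuming lm-eval is installed and using the config/lm_eval_test_config.yaml added above:

from eval_converters.lm_eval.converter import LMEvalRunner
from eval_converters.lm_eval.adapter import LMEvalAdapter

# Step 1: run the evaluation; outputs land in the config's output_dir (test_outputs)
runner = LMEvalRunner("config/lm_eval_test_config.yaml")
runner.run()

# Step 2: convert the dumped config.yaml / results_*.json / samples_*.jsonl
adapter = LMEvalAdapter()
results = adapter.transform_from_directory(runner.output_dir)
for r in results[:3]:
    print(r.evaluation_id, r.model.model_info.name, r.evaluation.score)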