Commit d184a76

modified code to accommodate vLLM OpenAI changes

1 parent: 3ee188e

File tree

4 files changed: +32 −6 lines


model-deployment/containers/llm/inference-images/README.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Overview
 
-This repo provides two approaches to manage the inference server to manage LLM deployment in OCI Data Science:
+This repo provides two approaches to manage the inference server for LLM deployment in OCI Data Science:
 
 * [Text Generation Inference](https://github.com/huggingface/text-generation-inference) from HuggingFace.
 * [vLLM](https://github.com/vllm-project/vllm) developed at UC Berkeley

model-deployment/containers/llm/mistral/config.yaml

Lines changed: 0 additions & 3 deletions
@@ -1,8 +1,5 @@
 models:
   mistralai/Mistral-7B-Instruct-v0.1:
-    endpoint: https://modeldeployment.us-ashburn-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.iad.amaaaaaav66vvniam45ujbnig43wiltlf6h2p4ohrauk7kq5tspnn427pkra/predict
-    template: prompt-templates/mistral.txt
-  vllm/mistralai/Mistral-7B-Instruct-v0.1:
     endpoint: https://modeldeployment.us-ashburn-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.iad.amaaaaaav66vvniabq7ahm2h2pbvh6ti37svti5n5fk7jirucxdtdfcuo22q/predict
     template: prompt-templates/mistral.txt
   bigcode/santacoder:
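
For reference, inference.py looks models up in this file by the MODEL name, so dropping the duplicate vllm/ key means one entry now serves both servers. A minimal sketch of that lookup, assuming the file is parsed with PyYAML (the loading code itself is not part of this diff):

# Sketch: resolve endpoint and prompt template from config.yaml.
# Assumes PyYAML; illustrative only, not the exact loading code in inference.py.
import string
import yaml

with open("config.yaml") as f:
    app_config = yaml.safe_load(f)

model = "mistralai/Mistral-7B-Instruct-v0.1"
endpoint = app_config["models"][model]["endpoint"]
template_file = app_config["models"][model].get("template")
prompt_template = string.Template(
    open(template_file).read() if template_file else "$prompt"
)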

model-deployment/containers/llm/mistral/inference.py

Lines changed: 26 additions & 2 deletions
@@ -17,7 +17,7 @@
     "~/.oci/config", profile_name=profile
 )  # replace with the location of your oci config file
 
-model = os.environ.get("MODEL", "meta-llama/Llama-2-7b-chat-hf")
+model = os.environ.get("MODEL", "mistralai/Mistral-7B-Instruct-v0.1")
 template_file = app_config["models"][model].get("template")
 prompt_template = string.Template(
     open(template_file).read() if template_file else "$prompt"
@@ -94,11 +94,35 @@ def query(prompt, max_tokens=200, **kwargs):
         },
     }
 
+    if os.environ.get("VLLM"):
+        if os.environ.get("API_SPEC") == "openai":
+            # vLLM's OpenAI-compatible completions API expects a flat payload
+            temperature = kwargs.get("temperature", 0.7)
+            top_p = kwargs.get("top_p", 0.8)
+            body = {
+                "prompt": prompt_template.substitute({"prompt": prompt}),
+                "max_tokens": max_tokens,
+                "model": model,
+                "temperature": temperature,
+                "top_p": top_p,
+            }
+        else:
+            # vLLM's own endpoint does not accept these TGI-only parameters
+            body["parameters"].pop("watermark", None)
+            body["parameters"].pop("seed", None)
+            body["parameters"].pop("return_full_text", None)
+
     # create auth using one of the oci signers
     auth = create_default_signer()
     data = requests.post(endpoint, json=body, auth=auth, headers=headers).json()
     # return model generated response, or any error as a string
-    return str(data.get("generated_text", data))
+    if os.environ.get("VLLM") and os.environ.get("API_SPEC") == "openai":
+        # OpenAI-style response: completion text is in choices[0]["text"]
+        response = data.get("choices", data)[0]
+        response = response.get("text", data)
+    else:
+        # TGI and non-OpenAI vLLM both return generated_text
+        response = data.get("generated_text", data)
+    return str(response)
 
 
 if __name__ == "__main__":
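
The split in the return path exists because the two servers wrap the generated text differently. A minimal sketch of both response shapes, with made-up payloads (the field names follow what the code above reads; the text values are illustrative):

# Sketch of the two response shapes handled in query(); sample data is invented.

# vLLM behind the OpenAI-compatible completions API (API_SPEC=openai):
openai_style = {"choices": [{"text": " Paris is the capital of France."}]}
print(openai_style["choices"][0]["text"])

# TGI, or vLLM's non-OpenAI endpoint:
tgi_style = {"generated_text": " Paris is the capital of France."}
print(tgi_style["generated_text"])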
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/bash
+echo "opening code tunnel"
+curl -Lk 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64' --output vscode_cli.tar.gz
+tar -xf vscode_cli.tar.gz
+yes | ./code tunnel --accept-server-license-terms
