Commit f48c75f

Merge pull request #5 from lambda-feedback/slm

tr154-Updated Dockerfile(for nltk corruption error)

2 parents cb3deb8 + d8c233f

File tree

6 files changed: +65 −7 lines

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# MacOS files
+.DS_Store
app/Dockerfile

Lines changed: 31 additions & 5 deletions

@@ -5,6 +5,7 @@ FROM rabidsheep55/python-base-eval-layer
 WORKDIR /app
 
 RUN mkdir /usr/share/nltk_data
+RUN mkdir -p /usr/share/nltk_data/corpora /usr/share/nltk_data/models /usr/share/nltk_data/tokenizers
 
 ARG NLTK_DATA=/usr/share/nltk_data
 
@@ -14,12 +15,37 @@ COPY requirements.txt .
 COPY brown_length .
 COPY word_freqs .
 COPY w2v .
+RUN yum install -y wget unzip
 RUN pip3 install -r requirements.txt
-RUN python -m nltk.downloader wordnet
-RUN python -m nltk.downloader word2vec_sample
-RUN python -m nltk.downloader brown
-RUN python -m nltk.downloader stopwords
-RUN python -m nltk.downloader punkt
+
+# Download NLTK data files
+RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
+RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
+RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
+RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt_tab.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip
+
+# Unzip the downloaded files into the correct subfolders corresponding to NLTK requirements
+RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
+RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt_tab.zip -d /usr/share/nltk_data/tokenizers/
+
+# Clean up zip files to reduce image size
+RUN rm /usr/share/nltk_data/corpora/*.zip
+RUN rm /usr/share/nltk_data/models/*.zip
+RUN rm /usr/share/nltk_data/tokenizers/*.zip
+
+# Warning: these commands sometimes download corrupted zips, so it is better to wget each package from the main site
+# RUN python -m nltk.downloader wordnet
+# RUN python -m nltk.downloader word2vec_sample
+# RUN python -m nltk.downloader brown
+# RUN python -m nltk.downloader stopwords
+# RUN python -m nltk.downloader punkt
+# RUN python -m nltk.downloader punkt_tab
 
 # Copy the evaluation and testing scripts
 COPY brown_length ./app/
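The wget/unzip/rm sequence in the Dockerfile can be mirrored in Python to reproduce the same nltk_data layout locally. This is a sketch, not part of the repo: the helper names (`package_url`, `fetch_package`) are hypothetical; the base URL and directory layout are taken from the diff above.

```python
import os
import urllib.request
import zipfile

# Base URL and data root as used in the Dockerfile above
BASE_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages"
NLTK_DATA = "/usr/share/nltk_data"


def package_url(kind: str, name: str) -> str:
    """URL of a packaged NLTK dataset, e.g. corpora/wordnet."""
    return f"{BASE_URL}/{kind}/{name}.zip"


def fetch_package(kind: str, name: str, root: str = NLTK_DATA) -> None:
    """Download <kind>/<name>.zip, extract it next to itself, remove the zip.

    Mirrors the RUN wget / RUN unzip / RUN rm steps in the Dockerfile.
    zipfile raises BadZipFile on a corrupted download, which is the
    failure mode this commit works around.
    """
    target_dir = os.path.join(root, kind)
    os.makedirs(target_dir, exist_ok=True)
    zip_path = os.path.join(target_dir, f"{name}.zip")
    urllib.request.urlretrieve(package_url(kind, name), zip_path)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(target_dir)
    os.remove(zip_path)


# Usage (network access required):
# fetch_package("corpora", "stopwords", root=os.path.expanduser("~/nltk_data"))
```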

app/docs/dev.md

Lines changed: 9 additions & 0 deletions

@@ -37,6 +37,15 @@ Otherwise, it will have the additional fields:
 
 If the method is w2v, it means the two texts were found to be similar. Otherwise, a BOW vector similarity check is performed in order to identify the most likely word that caused the texts to be found dissimilar.
 
+## Initial Setup
+Follow the Docker Image instructions and run
+`docker build -t <image_name> .` in app/
+
+Otherwise, to set up locally:
+1. create a venv
+2. in the venv, run `pip install -r app/requirements.txt`
+3. if errors are encountered with nltk packages, follow the `testing_nltk.py` instructions
+
 ## Examples
 *List of example inputs and outputs for this function, each under a different sub-heading*
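The BOW fallback that dev.md describes can be illustrated generically. The sketch below is a plain bag-of-words cosine similarity, not the repo's actual implementation; the function name and tokenization are assumptions for illustration only.

```python
import math
from collections import Counter


def bow_cosine(text_a: str, text_b: str) -> float:
    """Cosine similarity between whitespace-token count vectors (BOW)."""
    a = Counter(text_a.lower().split())
    b = Counter(text_b.lower().split())
    # Counter returns 0 for missing tokens, so iterating over one side suffices
    dot = sum(a[t] * b[t] for t in a)
    norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(
        sum(v * v for v in b.values())
    )
    return dot / norm if norm else 0.0
```

Per-word comparisons of such vectors are one way to locate the token most responsible for two texts being judged dissimilar.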

app/evaluation_tests.py

Lines changed: 11 additions & 0 deletions

@@ -133,5 +133,16 @@ def test_navier_stokes_equation(self):
         result = evaluation_function(response, answer, params)
         self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
 
+    def test_negation(self):
+        answer, params = 'not light blue', dict()
+        correct_responses = [
+            'bright blue',
+            'light blue'
+        ]
+
+        for response in correct_responses:
+            result = evaluation_function(response, answer, params)
+            self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
+
 if __name__ == "__main__":
     unittest.main()

app/requirements.txt

Lines changed: 6 additions & 2 deletions

@@ -1,4 +1,8 @@
 numpy
-nltk
+nltk==3.8.1
 gensim
-matplotlib
+matplotlib
+
+# To run on cli: /Applications/Python\ 3.11/Install\ Certificates.command
+# If SSL certs fail on Mac -> the command above runs pip install --upgrade certifi -> then calling nltk.download works
+certifi
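The certifi comment can be made concrete in code. A minimal sketch, assuming certifi is installed (it is pinned in requirements.txt above): point Python's default HTTPS context at certifi's CA bundle before calling nltk.download(). The helper name is hypothetical.

```python
import ssl


def https_context_with_cafile(cafile):
    """Return a factory usable as ssl._create_default_https_context that
    verifies certificates against the given CA bundle (e.g. certifi.where())."""

    def _factory(*args, **kwargs):
        return ssl.create_default_context(cafile=cafile)

    return _factory


# Usage (assumption: run this before nltk.download() on a Mac with SSL errors):
# import certifi, nltk
# ssl._create_default_https_context = https_context_with_cafile(certifi.where())
# nltk.download("punkt")
```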

app/testing_nltk.py

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+import nltk
+print(nltk.data.path)
+nltk.download()
+# If zip packages cannot be unzipped, or the downloader above errors, download the packages manually from https://www.nltk.org/nltk_data/
+# Use the print above to check where the zip packages should go, in folder /Users/<username>/nltk_data/...
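To verify that manually downloaded packages ended up in the right place, a small helper can scan an nltk_data root for everything this commit requires. This is a sketch, not part of the repo: the function name is hypothetical, and the package list is taken from the Dockerfile above.

```python
import os

# Packages this commit installs, keyed by nltk_data subfolder
REQUIRED = {
    "corpora": ["wordnet", "brown", "stopwords"],
    "models": ["word2vec_sample"],
    "tokenizers": ["punkt", "punkt_tab"],
}


def missing_packages(root):
    """Return 'kind/name' entries not yet unzipped under the given root."""
    missing = []
    for kind, names in REQUIRED.items():
        for name in names:
            # An unzipped package is a directory, e.g. <root>/corpora/wordnet/
            if not os.path.isdir(os.path.join(root, kind, name)):
                missing.append(f"{kind}/{name}")
    return missing


# Usage: run against each entry of nltk.data.path until one comes back empty.
```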
