Commit 4774806

fix(initial setup): dockerfile corrected for encountered corrupted nltk downloads
1 parent 3ea02b2 commit 4774806

File tree

3 files changed: +40 -25 lines changed

app/Dockerfile
app/evaluation_tests.py
app/requirements.txt


app/Dockerfile

Lines changed: 28 additions & 24 deletions
@@ -15,34 +15,38 @@ COPY requirements.txt .
 COPY brown_length .
 COPY word_freqs .
 COPY w2v .
-# RUN apt-get update && apt-get install -y wget unzip
+RUN cat /etc/os-release
+RUN yum install -y wget unzip
 RUN pip3 install -r requirements.txt

-# # Download NLTK data files
-# RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
-# RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
-# RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
-# RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
-# RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
-
-# # Unzip the downloaded files into the correct subfolders
-# RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
-# RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
-
-# # Clean up zip files to reduce image size
-# RUN rm /usr/share/nltk_data/corpora/*.zip
-# RUN rm /usr/share/nltk_data/models/*.zip
-# RUN rm /usr/share/nltk_data/tokenizers/*.zip
+# Download NLTK data files
+RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
+RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
+RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
+RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt_tab.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip
+
+# Unzip the downloaded files into the correct subfolders
+RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
+RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt_tab.zip -d /usr/share/nltk_data/tokenizers/
+
+# Clean up zip files to reduce image size
+RUN rm /usr/share/nltk_data/corpora/*.zip
+RUN rm /usr/share/nltk_data/models/*.zip
+RUN rm /usr/share/nltk_data/tokenizers/*.zip

 # Warnings: those commands sometimes download corrupted zips, so it is better to wget each package from the main site
-RUN python -m nltk.downloader wordnet
-RUN python -m nltk.downloader word2vec_sample
-RUN python -m nltk.downloader brown
-RUN python -m nltk.downloader stopwords
-RUN python -m nltk.downloader punkt
+# RUN python -m nltk.downloader wordnet
+# RUN python -m nltk.downloader word2vec_sample
+# RUN python -m nltk.downloader brown
+# RUN python -m nltk.downloader stopwords
+# RUN python -m nltk.downloader punkt
+# RUN python -m nltk.downloader punkt_tab

 # Copy the evaluation and testing scripts
 COPY brown_length ./app/
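
Since the point of the change is to avoid corrupted NLTK downloads, a quick sanity check is to confirm that every manually unpacked resource actually resolves from the image's data directory. The sketch below is not part of the commit; the script name verify_nltk_data.py is made up for illustration, and the only assumptions are the data path and package names taken from the Dockerfile above.

# verify_nltk_data.py -- sketch, not part of this commit.
# Checks that each NLTK resource installed by the Dockerfile resolves from
# /usr/share/nltk_data; nltk.data.find raises LookupError if a resource is
# missing or did not unpack cleanly.
import nltk

nltk.data.path.append("/usr/share/nltk_data")  # data directory used in the Dockerfile

for resource in [
    "corpora/wordnet",
    "models/word2vec_sample",
    "corpora/brown",
    "corpora/stopwords",
    "tokenizers/punkt",
    "tokenizers/punkt_tab",
]:
    try:
        nltk.data.find(resource)
        print(f"ok       {resource}")
    except LookupError:
        print(f"MISSING  {resource}")

If such a script were copied into the image, a final RUN python verify_nltk_data.py step would make a corrupted or missing package fail the build rather than surface at request time.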

app/evaluation_tests.py

Lines changed: 11 additions & 0 deletions
@@ -133,5 +133,16 @@ def test_navier_stokes_equation(self):
             result = evaluation_function(response, answer, params)
             self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
 
+    def test_negation(self):
+        answer, params = 'not light blue', dict()
+        correct_responses = [
+            'bright blue',
+            'light blue'
+        ]
+
+        for response in correct_responses:
+            result = evaluation_function(response, answer, params)
+            self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
+
 if __name__ == "__main__":
     unittest.main()
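
For reference (not part of this commit), the new case can be run on its own from the app directory with a command along the lines of: python -m unittest evaluation_tests -k test_negation. The -k flag filters tests by name; the exact module path depends on how the repository is laid out inside the image.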

app/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 numpy
-nltk
+nltk==3.8.1
 gensim
 matplotlib
 

0 commit comments
