Commit 4774806

fix(initial setup): dockerfile corrected for encountered corrupted nltk downloads
1 parent 3ea02b2 commit 4774806

File tree

3 files changed: +40 -25 lines changed

app/Dockerfile
app/evaluation_tests.py
app/requirements.txt


app/Dockerfile

Lines changed: 28 additions & 24 deletions
@@ -15,34 +15,38 @@ COPY requirements.txt .
 COPY brown_length .
 COPY word_freqs .
 COPY w2v .
-# RUN apt-get update && apt-get install -y wget unzip
+RUN cat /etc/os-release
+RUN yum install -y wget unzip
 RUN pip3 install -r requirements.txt

-# # Download NLTK data files
-# RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
-# RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
-# RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
-# RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
-# RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
-
-# # Unzip the downloaded files into the correct subfolders
-# RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
-# RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
-# RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
-
-# # Clean up zip files to reduce image size
-# RUN rm /usr/share/nltk_data/corpora/*.zip
-# RUN rm /usr/share/nltk_data/models/*.zip
-# RUN rm /usr/share/nltk_data/tokenizers/*.zip
+# Download NLTK data files
+RUN wget -O /usr/share/nltk_data/corpora/wordnet.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip
+RUN wget -O /usr/share/nltk_data/models/word2vec_sample.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/models/word2vec_sample.zip
+RUN wget -O /usr/share/nltk_data/corpora/brown.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown.zip
+RUN wget -O /usr/share/nltk_data/corpora/stopwords.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
+RUN wget -O /usr/share/nltk_data/tokenizers/punkt_tab.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip
+
+# Unzip the downloaded files into the correct subfolders
+RUN unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/models/word2vec_sample.zip -d /usr/share/nltk_data/models/
+RUN unzip /usr/share/nltk_data/corpora/brown.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt.zip -d /usr/share/nltk_data/tokenizers/
+RUN unzip /usr/share/nltk_data/tokenizers/punkt_tab.zip -d /usr/share/nltk_data/tokenizers/
+
+# Clean up zip files to reduce image size
+RUN rm /usr/share/nltk_data/corpora/*.zip
+RUN rm /usr/share/nltk_data/models/*.zip
+RUN rm /usr/share/nltk_data/tokenizers/*.zip

 # Warnings: those commands sometimes download corrupted zips, so it is better to wget each package from the main site
-RUN python -m nltk.downloader wordnet
-RUN python -m nltk.downloader word2vec_sample
-RUN python -m nltk.downloader brown
-RUN python -m nltk.downloader stopwords
-RUN python -m nltk.downloader punkt
+# RUN python -m nltk.downloader wordnet
+# RUN python -m nltk.downloader word2vec_sample
+# RUN python -m nltk.downloader brown
+# RUN python -m nltk.downloader stopwords
+# RUN python -m nltk.downloader punkt
+# RUN python -m nltk.downloader punkt_tab

 # Copy the evaluation and testing scripts
 COPY brown_length ./app/
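
Since the point of the change is to avoid corrupted NLTK downloads, a quick sanity check is to confirm that every manually unpacked resource actually resolves from the image's data directory. The sketch below is not part of the commit; the script name verify_nltk_data.py is made up for illustration, and the only assumptions are the data path and package names taken from the Dockerfile above.

# verify_nltk_data.py -- sketch, not part of this commit.
# Checks that each NLTK resource installed by the Dockerfile resolves from
# /usr/share/nltk_data; nltk.data.find raises LookupError if a resource is
# missing or did not unpack cleanly.
import nltk

nltk.data.path.append("/usr/share/nltk_data")  # data directory used in the Dockerfile

for resource in [
    "corpora/wordnet",
    "models/word2vec_sample",
    "corpora/brown",
    "corpora/stopwords",
    "tokenizers/punkt",
    "tokenizers/punkt_tab",
]:
    try:
        nltk.data.find(resource)
        print(f"ok       {resource}")
    except LookupError:
        print(f"MISSING  {resource}")

If such a script were copied into the image, a final RUN python verify_nltk_data.py step would make a corrupted or missing package fail the build rather than surface at request time.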

app/evaluation_tests.py

Lines changed: 11 additions & 0 deletions
@@ -133,5 +133,16 @@ def test_navier_stokes_equation(self):
             result = evaluation_function(response, answer, params)
             self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
 
+    def test_negation(self):
+        answer, params = 'not light blue', dict()
+        correct_responses = [
+            'bright blue',
+            'light blue'
+        ]
+
+        for response in correct_responses:
+            result = evaluation_function(response, answer, params)
+            self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
+
 if __name__ == "__main__":
     unittest.main()
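
For reference (not part of this commit), the new case can be run on its own from the app directory with a command along the lines of: python -m unittest evaluation_tests -k test_negation. The -k flag filters tests by name; the exact module path depends on how the repository is laid out inside the image.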

app/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 numpy
-nltk
+nltk==3.8.1
 gensim
 matplotlib
 

0 commit comments
