From 8a331f27c53bd87627fb25c692aefda1642345ff Mon Sep 17 00:00:00 2001
From: fisa712 <101712610+fisa712@users.noreply.github.com>
Date: Fri, 24 Feb 2023 17:19:09 +0500
Subject: [PATCH 1/5] Create preprocessing.ipynb

---
 ch08/additional_features/preprocessing.ipynb | 59 ++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 ch08/additional_features/preprocessing.ipynb

diff --git a/ch08/additional_features/preprocessing.ipynb b/ch08/additional_features/preprocessing.ipynb
new file mode 100644
index 00000000..c3f2576e
--- /dev/null
+++ b/ch08/additional_features/preprocessing.ipynb
@@ -0,0 +1,59 @@
+# NOTE(review): this file is committed as preprocessing.ipynb but holds plain
+# Python source, not notebook JSON. test_preprocessing.py imports it as the
+# module "preprocessing", which requires the file to be named preprocessing.py.
+from functools import lru_cache
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+
+
+@lru_cache(maxsize=1)
+def _english_stop_words():
+    """Return the NLTK English stop-word list as a frozenset, loaded once."""
+    return frozenset(stopwords.words('english'))
+
+
+def remove_stopwords(text):
+    """Return ``text`` with English stop words removed.
+
+    The text is split with ``word_tokenize``; tokens whose lower-cased form is
+    in the NLTK English stop-word list are dropped and the rest are joined
+    with single spaces.
+    """
+    stop_words = _english_stop_words()
+    word_tokens = word_tokenize(text)
+    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
+    return ' '.join(filtered_text)
+
+
+def perform_lemmatization(text):
+    """Return ``text`` with each token passed through ``WordNetLemmatizer``.
+
+    ``lemmatize`` is called without a part-of-speech argument, so every token
+    is lemmatized with the lemmatizer's default setting.
+    """
+    lemmatizer = WordNetLemmatizer()
+    word_tokens = word_tokenize(text)
+    lemmatized_text = [lemmatizer.lemmatize(word) for word in word_tokens]
+    return ' '.join(lemmatized_text)
+
+
+def perform_stemming(text):
+    """Return ``text`` with each token reduced by ``PorterStemmer``."""
+    stemmer = PorterStemmer()
+    word_tokens = word_tokenize(text)
+    stemmed_text = [stemmer.stem(word) for word in word_tokens]
+    return ' '.join(stemmed_text)
+
+
+def preprocess_text(text):
+    """Remove stop words, then lemmatize, then stem ``text``.
+
+    Each stage re-tokenizes the space-joined output of the previous stage.
+    """
+    text = remove_stopwords(text)
+    text = perform_lemmatization(text)
+    text = perform_stemming(text)
+    return text

From 8968c20945eadc41687fa40a31fda0da562fd0d0 Mon Sep 17 00:00:00 2001
From: fisa712 <101712610+fisa712@users.noreply.github.com>
Date: Fri, 24 Feb 2023 17:20:39 +0500
Subject: [PATCH 2/5] Create README.md

---
 ch08/additional_features/README.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 ch08/additional_features/README.md

diff --git a/ch08/additional_features/README.md b/ch08/additional_features/README.md
new file mode 100644
index 00000000..ea70841a
--- /dev/null
+++ b/ch08/additional_features/README.md
@@ -0,0 +1,22 @@
+Task: Add more data preprocessing steps
+In this task, we will explore the impact of adding more data preprocessing steps on the accuracy and generalization of our sentiment analysis model. Specifically, we will add stemming, lemmatization, and/or stop-word removal to the existing data preprocessing steps.
+
+Files and Folders
+sentiment_analysis.py: This is the main script that performs sentiment analysis on a given input text.
+
+preprocessing.py: This script contains the existing data preprocessing steps. You will modify this script to add more preprocessing steps.
+
+data: This folder contains the training and test data.
+
+Instructions
+Clone the repository and create a new branch for this task.
+
+Open the preprocessing.py script and add more data preprocessing steps such as stemming, lemmatization, or stop-word removal. You can use any NLP library such as NLTK or spaCy to implement these preprocessing steps.
+
+Train the model using the modified data preprocessing steps and evaluate its accuracy and generalization using the test data.
+
+Update the README file with the results of the evaluation and a description of the added preprocessing steps.
+
+Push the changes to the branch and create a pull request.
+
+Wait for the reviewer to approve the pull request and merge it with the main branch.
From 47b11b39d9dac8cbe2e3fd3660894062e1b87509 Mon Sep 17 00:00:00 2001
From: fisa712 <101712610+fisa712@users.noreply.github.com>
Date: Fri, 24 Feb 2023 18:05:43 +0500
Subject: [PATCH 3/5] Create test_preprocessing.py

---
 .../additional_features/test_preprocessing.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 ch08/additional_features/test_preprocessing.py

diff --git a/ch08/additional_features/test_preprocessing.py b/ch08/additional_features/test_preprocessing.py
new file mode 100644
index 00000000..bfaf844c
--- /dev/null
+++ b/ch08/additional_features/test_preprocessing.py
@@ -0,0 +1,40 @@
+import unittest
+
+from preprocessing import (
+    perform_lemmatization,
+    perform_stemming,
+    preprocess_text,
+    remove_stopwords,
+)
+
+
+class TestPreprocessing(unittest.TestCase):
+    """Expected-output checks for the helpers in the preprocessing module."""
+
+    def test_remove_stopwords(self):
+        # "such" and "as" are English stop words too, so they are dropped along
+        # with "this", "is", "a", "that", "some", "the" and "and".
+        text = "this is a sample text that includes some stop words such as the, and, etc."
+        expected_output = "sample text includes stop words , , etc ."
+        self.assertEqual(remove_stopwords(text), expected_output)
+
+    def test_perform_lemmatization(self):
+        text = "running played plays"
+        expected_output = "running played play"
+        self.assertEqual(perform_lemmatization(text), expected_output)
+
+    def test_perform_stemming(self):
+        text = "running played plays"
+        expected_output = "run play play"
+        self.assertEqual(perform_stemming(text), expected_output)
+
+    def test_preprocess_text(self):
+        # The leading "This" is removed as a stop word (the filter compares
+        # lower-cased tokens), so no "thi" stem can appear in the output.
+        text = "This is a sample text. It includes some stop words, and it has words in different tenses (e.g. playing, played)."
+        expected_output = "sampl text . includ stop word , word differ tens ( e.g. play , play ) ."
+        self.assertEqual(preprocess_text(text), expected_output)
+
+
+if __name__ == '__main__':
+    unittest.main()

From c9e7ae4f88f99f2be87f0964c08ea90a68be733a Mon Sep 17 00:00:00 2001
From: fisa712 <101712610+fisa712@users.noreply.github.com>
Date: Fri, 24 Feb 2023 18:14:20 +0500
Subject: [PATCH 4/5] Create test_preprocessing.py

---
 .../.github/workflows/test_preprocessing.py | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 ch08/additional_features/.github/workflows/test_preprocessing.py

diff --git a/ch08/additional_features/.github/workflows/test_preprocessing.py b/ch08/additional_features/.github/workflows/test_preprocessing.py
new file mode 100644
index 00000000..bfaf844c
--- /dev/null
+++ b/ch08/additional_features/.github/workflows/test_preprocessing.py
@@ -0,0 +1,27 @@
+import unittest
+from preprocessing import *
+
+class TestPreprocessing(unittest.TestCase):
+
+    def test_remove_stopwords(self):
+        text = "this is a sample text that includes some stop words such as the, and, etc."
+        expected_output = "sample text includes stop words like , , etc ."
+        self.assertEqual(remove_stopwords(text), expected_output)
+
+    def test_perform_lemmatization(self):
+        text = "running played plays"
+        expected_output = "running played play"
+        self.assertEqual(perform_lemmatization(text), expected_output)
+
+    def test_perform_stemming(self):
+        text = "running played plays"
+        expected_output = "run play play"
+        self.assertEqual(perform_stemming(text), expected_output)
+
+    def test_preprocess_text(self):
+        text = "This is a sample text. It includes some stop words, and it has words in different tenses (e.g. playing, played)."
+        expected_output = "thi sampl text . includ stop word , word differ tens ( e.g. play , play ) ."
+        self.assertEqual(preprocess_text(text), expected_output)
+
+if __name__ == '__main__':
+    unittest.main()

From 3f3180b9723aae8c4f5c69a546971801002b8ff5 Mon Sep 17 00:00:00 2001
From: fisa712 <101712610+fisa712@users.noreply.github.com>
Date: Fri, 24 Feb 2023 18:17:19 +0500
Subject: [PATCH 5/5] Update and rename test_preprocessing.py to
 i191855_update.yml

---
 .../.github/workflows/i191855_update.yml      | 29 +++++++++++++++++++
 .../.github/workflows/test_preprocessing.py   | 27 -------------------
 2 files changed, 29 insertions(+), 27 deletions(-)
 create mode 100644 ch08/additional_features/.github/workflows/i191855_update.yml
 delete mode 100644 ch08/additional_features/.github/workflows/test_preprocessing.py

diff --git a/ch08/additional_features/.github/workflows/i191855_update.yml b/ch08/additional_features/.github/workflows/i191855_update.yml
new file mode 100644
index 00000000..153592f9
--- /dev/null
+++ b/ch08/additional_features/.github/workflows/i191855_update.yml
@@ -0,0 +1,29 @@
+name: push events workflow
+
+# NOTE(review): GitHub Actions only loads workflows from .github/workflows at
+# the repository root; confirm this nested location is intended.
+on: push
+
+jobs:
+  unit-testing:
+    runs-on: ubuntu-latest
+
+    defaults:
+      run:
+        # test_preprocessing.py lives in this directory, not the repository root.
+        working-directory: ch08/additional_features
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Install Package
+        # The preprocessing module imports nltk, so the tests need it installed.
+        run: pip install pytest nltk numpy pandas
+
+      - name: Download NLTK Data
+        # Data packages for stopwords.words, word_tokenize and WordNetLemmatizer.
+        run: python -m nltk.downloader stopwords punkt punkt_tab wordnet omw-1.4
+
+      - name: Run Test
+        run: pytest test_preprocessing.py
diff --git a/ch08/additional_features/.github/workflows/test_preprocessing.py b/ch08/additional_features/.github/workflows/test_preprocessing.py
deleted file mode 100644
index bfaf844c..00000000
--- a/ch08/additional_features/.github/workflows/test_preprocessing.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import unittest
-from preprocessing import *
-
-class TestPreprocessing(unittest.TestCase):
-
-    def test_remove_stopwords(self):
-        text = "this is a sample text that includes some stop words such as the, and, etc."
-        expected_output = "sample text includes stop words like , , etc ."
-        self.assertEqual(remove_stopwords(text), expected_output)
-
-    def test_perform_lemmatization(self):
-        text = "running played plays"
-        expected_output = "running played play"
-        self.assertEqual(perform_lemmatization(text), expected_output)
-
-    def test_perform_stemming(self):
-        text = "running played plays"
-        expected_output = "run play play"
-        self.assertEqual(perform_stemming(text), expected_output)
-
-    def test_preprocess_text(self):
-        text = "This is a sample text. It includes some stop words, and it has words in different tenses (e.g. playing, played)."
-        expected_output = "thi sampl text . includ stop word , word differ tens ( e.g. play , play ) ."
-        self.assertEqual(preprocess_text(text), expected_output)
-
-if __name__ == '__main__':
-    unittest.main()