diff --git a/ch08/additional_features/.github/workflows/i191855_update.yml b/ch08/additional_features/.github/workflows/i191855_update.yml
new file mode 100644
index 00000000..153592f9
--- /dev/null
+++ b/ch08/additional_features/.github/workflows/i191855_update.yml
@@ -0,0 +1,25 @@
+# Run the preprocessing unit tests on every push.
+name : push events workflow
+
+on : push
+
+jobs :
+  unit-testing :
+    runs-on: ubuntu-latest
+
+    steps :
+    - name : Checkout Code
+      uses : actions/checkout@v2
+
+    # nltk is required: test_preprocessing.py imports preprocessing,
+    # which imports nltk at module load time.
+    - name : Install Package
+      run : pip install pytest numpy pandas nltk
+
+    # The tokenizer, stop-word and WordNet data are not bundled with the pip
+    # package; without them the functions under test raise LookupError.
+    - name : Download NLTK Data
+      run : python -m nltk.downloader punkt punkt_tab stopwords wordnet omw-1.4
+
+    - name : Run Test
+      run : pytest test_preprocessing.py
diff --git a/ch08/additional_features/README.md b/ch08/additional_features/README.md
new file mode 100644
index 00000000..ea70841a
--- /dev/null
+++ b/ch08/additional_features/README.md
@@ -0,0 +1,22 @@
+Task: Add more data preprocessing steps
+In this task, we will explore the impact of adding more data preprocessing steps on the accuracy and generalization of our sentiment analysis model. Specifically, we will add stemming, lemmatization, and/or stop-word removal to the existing data preprocessing steps.
+
+Files and Folders
+sentiment_analysis.py: This is the main script that performs sentiment analysis on a given input text.
+
+preprocessing.py: This script contains the existing data preprocessing steps. You will modify this script to add more preprocessing steps.
+
+data: This folder contains the training and test data.
+
+Instructions
+Clone the repository and create a new branch for this task.
+
+Open the preprocessing.py script and add more data preprocessing steps such as stemming, lemmatization, or stop-word removal. You can use any NLP library such as NLTK or spaCy to implement these preprocessing steps.
+
+Train the model using the modified data preprocessing steps and evaluate its accuracy and generalization using the test data.
+
+Update the README file with the results of the evaluation and a description of the added preprocessing steps.
+
+Push the changes to the branch and create a pull request.
+
+Wait for the reviewer to approve the pull request and merge it with the main branch.
diff --git a/ch08/additional_features/preprocessing.py b/ch08/additional_features/preprocessing.py
new file mode 100644
index 00000000..c3f2576e
--- /dev/null
+++ b/ch08/additional_features/preprocessing.py
@@ -0,0 +1,61 @@
+"""Text preprocessing helpers: stop-word removal, lemmatization and stemming.
+
+The NLTK tokenizer, stop-word and WordNet data must already be installed
+(for example with ``python -m nltk.downloader``); NLTK raises ``LookupError``
+when a required resource is missing.
+"""
+from functools import lru_cache
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+
+
+@lru_cache(maxsize=None)
+def _english_stop_words():
+    """Return NLTK's English stop words as a frozenset, loaded once per process."""
+    return frozenset(stopwords.words('english'))
+
+
+def remove_stopwords(text):
+    """Return ``text`` with English stop words removed.
+
+    A token is dropped when its lower-cased form is a stop word; the remaining
+    tokens are joined with single spaces.
+    """
+    stop_words = _english_stop_words()
+    word_tokens = word_tokenize(text)
+    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
+    return ' '.join(filtered_text)
+
+
+def perform_lemmatization(text):
+    """Return ``text`` with every token replaced by its WordNet lemma.
+
+    ``lemmatize`` is called without a part-of-speech argument, so tokens are
+    treated as nouns (NLTK's default).
+    """
+    lemmatizer = WordNetLemmatizer()
+    word_tokens = word_tokenize(text)
+    lemmatized_text = [lemmatizer.lemmatize(word) for word in word_tokens]
+    return ' '.join(lemmatized_text)
+
+
+def perform_stemming(text):
+    """Return ``text`` with every token reduced to its Porter stem."""
+    stemmer = PorterStemmer()
+    word_tokens = word_tokenize(text)
+    stemmed_text = [stemmer.stem(word) for word in word_tokens]
+    return ' '.join(stemmed_text)
+
+
+def preprocess_text(text):
+    """Run stop-word removal, then lemmatization, then stemming on ``text``.
+
+    Each stage re-tokenizes the space-joined output of the previous one.
+    """
+    text = remove_stopwords(text)
+    text = perform_lemmatization(text)
+    text = perform_stemming(text)
+    return text
diff --git a/ch08/additional_features/test_preprocessing.py b/ch08/additional_features/test_preprocessing.py
new file mode 100644
index 00000000..bfaf844c
--- /dev/null
+++ b/ch08/additional_features/test_preprocessing.py
@@ -0,0 +1,27 @@
+import unittest
+from preprocessing import perform_lemmatization, perform_stemming, preprocess_text, remove_stopwords
+
+class TestPreprocessing(unittest.TestCase):
+
+    def test_remove_stopwords(self):
+        text = "this is a sample text that includes some stop words such as the, and, etc."
+        expected_output = "sample text includes stop words , , etc ."  # "such" and "as" are stop words too
+        self.assertEqual(remove_stopwords(text), expected_output)
+
+    def test_perform_lemmatization(self):
+        text = "running played plays"
+        expected_output = "running played play"  # noun lemmatization only strips the plural "s"
+        self.assertEqual(perform_lemmatization(text), expected_output)
+
+    def test_perform_stemming(self):
+        text = "running played plays"
+        expected_output = "run play play"
+        self.assertEqual(perform_stemming(text), expected_output)
+
+    def test_preprocess_text(self):
+        text = "This is a sample text. It includes some stop words, and it has words in different tenses (e.g. playing, played)."
+        expected_output = "sampl text . includ stop word , word differ tens ( e.g. play , play ) ."  # "This" is a stop word (case-insensitive match), so no leading "thi"
+        self.assertEqual(preprocess_text(text), expected_output)
+
+if __name__ == '__main__':
+    unittest.main()