From c7de246ea94bbdec0c2fc47c200976d459aad6b3 Mon Sep 17 00:00:00 2001 From: Calix Huang Date: Tue, 22 Oct 2019 13:43:41 -0700 Subject: [PATCH 1/2] Modified Rake object, cut down on the lines of code, got rid of getters and added a setter method --- rake_nltk/rake.py | 63 +++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/rake_nltk/rake.py b/rake_nltk/rake.py index 53f5b9d..6252e6f 100644 --- a/rake_nltk/rake.py +++ b/rake_nltk/rake.py @@ -27,6 +27,7 @@ class Rake(object): def __init__( self, + text, stopwords=None, punctuations=None, language="english", @@ -44,6 +45,7 @@ def __init__( :param min_length: Minimum limit on the number of words in a phrase (Inclusive. Defaults to 1) """ + # By default use degree to frequency ratio as the metric. if isinstance(ranking_metric, Metric): self.metric = ranking_metric @@ -69,19 +71,26 @@ def __init__( # Stuff to be extracted from the provided text. self.frequency_dist = None - self.degree = None + self.word_degrees = None self.rank_list = None self.ranked_phrases = None - def extract_keywords_from_text(self, text): - """Method to extract keywords from the text provided. + # Initializing the text and building all the fields + self.set_text(text) - :param text: Text to extract keywords from, provided as a string. - """ + # You don't need all of the getter methods, you just need to call these fields off the Rake object + # Fields to call: + # - self.ranked_phrases + # - self.rank_list + # - self.frequency_dist + # - self.word_degrees + + def set_text(self, text): + self.text = text sentences = nltk.tokenize.sent_tokenize(text) - self.extract_keywords_from_sentences(sentences) + self._extract_keywords_from_sentences(sentences) - def extract_keywords_from_sentences(self, sentences): + def _extract_keywords_from_sentences(self, sentences): """Method to extract keywords from the list of sentences provided. 
:param sentences: Text to extraxt keywords from, provided as a list @@ -92,38 +101,6 @@ def extract_keywords_from_sentences(self, sentences): self._build_word_co_occurance_graph(phrase_list) self._build_ranklist(phrase_list) - def get_ranked_phrases(self): - """Method to fetch ranked keyword strings. - - :return: List of strings where each string represents an extracted - keyword string. - """ - return self.ranked_phrases - - def get_ranked_phrases_with_scores(self): - """Method to fetch ranked keyword strings along with their scores. - - :return: List of tuples where each tuple is formed of an extracted - keyword string and its score. Ex: (5.68, 'Four Scoures') - """ - return self.rank_list - - def get_word_frequency_distribution(self): - """Method to fetch the word frequency distribution in the given text. - - :return: Dictionary (defaultdict) of the format `word -> frequency`. - """ - return self.frequency_dist - - def get_word_degrees(self): - """Method to fetch the degree of words in the given text. Degree can be - defined as sum of co-occurances of the word with other words in the - given text. - - :return: Dictionary (defaultdict) of the format `word -> degree`. - """ - return self.degree - def _build_frequency_dist(self, phrase_list): """Builds frequency distribution of the words in the given body of text. @@ -148,9 +125,9 @@ def _build_word_co_occurance_graph(self, phrase_list): # use in other creative ways if required later. 
for (word, coword) in product(phrase, phrase): co_occurance_graph[word][coword] += 1 - self.degree = defaultdict(lambda: 0) + self.word_degrees = defaultdict(lambda: 0) for key in co_occurance_graph: - self.degree[key] = sum(co_occurance_graph[key].values()) + self.word_degrees[key] = sum(co_occurance_graph[key].values()) def _build_ranklist(self, phrase_list): """Method to rank each contender phrase using the formula @@ -166,9 +143,9 @@ def _build_ranklist(self, phrase_list): rank = 0.0 for word in phrase: if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO: - rank += 1.0 * self.degree[word] / self.frequency_dist[word] + rank += 1.0 * self.word_degrees[word] / self.frequency_dist[word] elif self.metric == Metric.WORD_DEGREE: - rank += 1.0 * self.degree[word] + rank += 1.0 * self.word_degrees[word] else: rank += 1.0 * self.frequency_dist[word] self.rank_list.append((rank, " ".join(phrase))) From 1de315ef10b01a5b306f60432521802d734e6118 Mon Sep 17 00:00:00 2001 From: Calix Huang Date: Tue, 22 Oct 2019 13:59:21 -0700 Subject: [PATCH 2/2] Changed the type of class fields and added setters --- rake_nltk/rake.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/rake_nltk/rake.py b/rake_nltk/rake.py index 6252e6f..315413b 100644 --- a/rake_nltk/rake.py +++ b/rake_nltk/rake.py @@ -48,9 +48,9 @@ def __init__( # By default use degree to frequency ratio as the metric. if isinstance(ranking_metric, Metric): - self.metric = ranking_metric + self.__metric = ranking_metric else: - self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO + self.__metric = Metric.DEGREE_TO_FREQUENCY_RATIO # If stopwords not provided we use language stopwords by default. self.stopwords = stopwords @@ -63,11 +63,10 @@ def __init__( self.punctuations = string.punctuation # All things which act as sentence breaks during keyword extraction. 
- self.to_ignore = set(chain(self.stopwords, self.punctuations)) + self.__to_ignore = set(chain(self.stopwords, self.punctuations)) - # Assign min or max length to the attributes - self.min_length = min_length - self.max_length = max_length + self.__min_length = min_length + self.__max_length = max_length # Stuff to be extracted from the provided text. self.frequency_dist = None @@ -90,6 +89,14 @@ def set_text(self, text): sentences = nltk.tokenize.sent_tokenize(text) self._extract_keywords_from_sentences(sentences) + def set_stopwords(self, stopwords): + self.stopwords = stopwords + self.set_text(self.text) + + def set_punctuations(self, punctuations): + self.punctuations = punctuations + self.set_text(self.text) + def _extract_keywords_from_sentences(self, sentences): """Method to extract keywords from the list of sentences provided. @@ -142,9 +149,9 @@ def _build_ranklist(self, phrase_list): for phrase in phrase_list: rank = 0.0 for word in phrase: - if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO: + if self.__metric == Metric.DEGREE_TO_FREQUENCY_RATIO: rank += 1.0 * self.word_degrees[word] / self.frequency_dist[word] - elif self.metric == Metric.WORD_DEGREE: + elif self.__metric == Metric.WORD_DEGREE: rank += 1.0 * self.word_degrees[word] else: rank += 1.0 * self.frequency_dist[word] @@ -190,10 +197,10 @@ def _get_phrase_list_from_words(self, word_list): :return: List of contender phrases that are formed after dropping stopwords and punctuations. """ - groups = groupby(word_list, lambda x: x not in self.to_ignore) + groups = groupby(word_list, lambda x: x not in self.__to_ignore) phrases = [tuple(group[1]) for group in groups if group[0]] return list( filter( - lambda x: self.min_length <= len(x) <= self.max_length, phrases + lambda x: self.__min_length <= len(x) <= self.__max_length, phrases ) )