import json

import gensim
import nltk
import spacy
from gensim.models.ldamodel import LdaModel

from preprocess import create_corpus, tokenizer, find_sentences

# https://features.propublica.org/climate-migration/model-how-climate-refugees-move-across-continents/


def topic_model(corpus: list, dic: gensim.corpora.Dictionary, number_of_topics: int):
    """Returns an LDA model built from the corpus for the given number of topics.

    :param corpus: list of documents, each a list of (token_id, token_count) tuples, i.e. a bag of words
    :param dic: gensim Dictionary mapping words to their integer ids
    :param number_of_topics: number of topics (summaries) to create
    :return: LDA model
    """
    model = LdaModel(corpus=corpus, id2word=dic, num_topics=number_of_topics)
    return model
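

# A minimal usage sketch (not part of the original module): the Dictionary and
# doc2bow calls are standard gensim API, while the document list is made up
# purely for illustration.
#
#     from gensim.corpora import Dictionary
#     docs = [["flood", "migration", "coast"], ["drought", "crops", "farmers"]]
#     dic = Dictionary(docs)
#     corpus = [dic.doc2bow(doc) for doc in docs]
#     lda = topic_model(corpus, dic, number_of_topics=2)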


def get_topic_keywords(model: gensim.models.ldamodel.LdaModel):
    """Returns the keywords and their scores for every topic in the model.

    :param model: LDA model
    :return: JSON string mapping each topic number to its keyword/score pairs
    """
    num_topics = model.num_topics
    topic_words_dic = {i: model.show_topic(i) for i in range(num_topics)}
    # JSON object keys must be strings, so stringify both keys and values.
    topic_words_dic = {str(key): str(value) for key, value in topic_words_dic.items()}
    topic_words_json = json.dumps(topic_words_dic)
    return topic_words_json


def get_sentence_distribution(model: gensim.models.ldamodel.LdaModel, dic: gensim.corpora.Dictionary, sentences: list):
    """Returns the dominant topic and its score for every sentence in the text.

    :param model: LDA model
    :param dic: gensim Dictionary mapping words to their integer ids
    :param sentences: list of sentences in the text
    :return: a list of tuples of the form (topic_number, score), one per sentence
    """
    distribution = []
    for sentence in sentences:
        sent_tokens = nltk.word_tokenize(sentence)
        bow = dic.doc2bow(sent_tokens)
        dist = model.get_document_topics(bow)
        # Keep only the topic with the highest probability for this sentence.
        dist = max(dist, key=lambda x: x[1])
        distribution.append(dist)
    return distribution
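

# A minimal sketch of calling get_sentence_distribution on raw text, continuing
# the example above; the text is made up for illustration, and nltk's sentence
# and word tokenizers need the "punkt" data to be downloaded first.
#
#     nltk.download("punkt")
#     text = "Floods displaced thousands. Farmers moved north after the drought."
#     sents = nltk.sent_tokenize(text)
#     dists = get_sentence_distribution(lda, dic, sents)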


def group_sentences(model: gensim.models.ldamodel.LdaModel, dists: list, sentences: list):
    """Returns groups of sentences for every topic.

    If the number of topics is greater than 1, returns a dictionary where every
    topic maps to a list of tuples of the form (sentence_number, score), sorted
    in descending order of score.

    If the number of topics is 1, returns a dictionary with a single item whose
    value is a list of (sentence_number, score) tuples sorted in descending
    order. In that case the score is the number of topic keywords appearing in
    the sentence divided by the number of words in the sentence.

    :param model: LDA model
    :param dists: a list of tuples of the form (topic_number, score)
    :param sentences: a list of sentences in the original text
    :return: dictionary of topic number and the sentences corresponding to the topic
    """
    no_topics = model.num_topics
    if no_topics != 1:
        sentence_groups = {i: [] for i in range(no_topics)}
        for i in range(len(dists)):
            key = dists[i][0]
            val = (i, dists[i][1])
            sentence_groups[key].append(val)
        for k, v in sentence_groups.items():
            v.sort(key=lambda x: x[1], reverse=True)
        return sentence_groups
    else:
        scores = {}
        sentence_groups = {}
        keywords = [item[0] for item in model.show_topic(0)]
        for i in range(len(sentences)):
            common_keywords = len(set(sentences[i].split()) & set(keywords))
            score = common_keywords / len(sentences[i].split())
            scores[i] = score
        scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        sentence_groups[0] = scores
        return sentence_groups
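

# A small worked example (illustrative values only): with two topics and
# dists = [(0, 0.9), (1, 0.8), (0, 0.6)], group_sentences returns
# {0: [(0, 0.9), (2, 0.6)], 1: [(1, 0.8)]}, i.e. sentence indices grouped by
# their dominant topic and sorted by score.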


def get_summaries(groups: dict, sentences: list):
    """Returns a string of text (summary) for every topic.

    :param groups: dictionary of topic number and the sentences corresponding to the topic
    :param sentences: list of sentences in the original text
    :return: JSON string of topic numbers and their extracted summaries
    """
    summaries = {}
    for k, v in groups.items():
        no_of_sentences = len(v)
        # Cap each summary at the ten highest-scoring sentences.
        if no_of_sentences > 10:
            v = v[:10]
        summaries[k] = []
        for sent_index in v:
            summaries[k].append(sentences[sent_index[0]])
    for k, v in summaries.items():
        summaries[k] = ' '.join(v)
    json_summaries = [{'topic': k, 'summary': v} for k, v in summaries.items()]
    json_summaries = json.dumps(json_summaries)
    return json_summaries


def find_entities(summary):
    """Returns the named entities found in a summary.

    :param summary: summary text to scan for named entities
    :return: JSON string mapping entity text to its entity label
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(summary)
    entities = []
    for i in doc.ents:
        # Keep only a subset of spaCy entity labels.
        if i.label_ in ['EVENT', 'FAC', 'GPE', 'LOC', 'NORP', 'ORG', 'PERSON', 'WORK_OF_ART']:
            entities.append((i.text, i.label_))
    entities_dict = {}
    for i in entities:
        entities_dict[i[0]] = i[1]
    entities_json = json.dumps(entities_dict)
    return entities_json
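

# A minimal end-to-end sketch of how the functions above fit together. The
# sample text is made up for illustration; running it requires the nltk
# "punkt" data and the spaCy "en_core_web_sm" model to be installed.
if __name__ == "__main__":
    sample_text = (
        "Rising seas flooded the coastal villages and thousands of families moved inland. "
        "A long drought ruined the harvest, so farmers left for the cities. "
        "Aid agencies opened shelters near the border to receive the new arrivals."
    )
    sentences = nltk.sent_tokenize(sample_text)
    tokenized = [nltk.word_tokenize(s) for s in sentences]

    # Build the bag-of-words corpus and train a small LDA model.
    dic = gensim.corpora.Dictionary(tokenized)
    bow_corpus = [dic.doc2bow(tokens) for tokens in tokenized]
    lda = topic_model(bow_corpus, dic, number_of_topics=2)
    print(get_topic_keywords(lda))

    # Group sentences by dominant topic and extract per-topic summaries.
    dists = get_sentence_distribution(lda, dic, sentences)
    groups = group_sentences(lda, dists, sentences)
    summaries = get_summaries(groups, sentences)
    print(summaries)

    # Entity extraction runs on the raw summary strings.
    for item in json.loads(summaries):
        print(find_entities(item['summary']))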