import json

import gensim
import nltk
import spacy
from gensim.models.ldamodel import LdaModel

from preprocess import create_corpus, tokenizer, find_sentences

# https://features.propublica.org/climate-migration/model-how-climate-refugees-move-across-continents/


def topic_model(corpus: list, dic: gensim.corpora.Dictionary, number_of_topics: int):
    """Returns an LDA model built from the corpus for the given number of topics.

    :param corpus: list of documents, each a list of (token_id, token_count) tuples, i.e. a bag of words
    :param dic: gensim Dictionary mapping words to their integer ids
    :param number_of_topics: number of topics (summaries) to create
    :return: LDA model
    """
    model = LdaModel(corpus=corpus, id2word=dic, num_topics=number_of_topics)
    return model
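

# A minimal usage sketch (not part of the original module): the Dictionary and
# doc2bow calls are standard gensim API, while the document list is made up
# purely for illustration.
#
#     from gensim.corpora import Dictionary
#     docs = [["flood", "migration", "coast"], ["drought", "crops", "farmers"]]
#     dic = Dictionary(docs)
#     corpus = [dic.doc2bow(doc) for doc in docs]
#     lda = topic_model(corpus, dic, number_of_topics=2)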


def get_topic_keywords(model: gensim.models.ldamodel.LdaModel):
    """Returns the keywords and their scores for every topic in the model.

    :param model: LDA model
    :return: JSON string mapping each topic number to its keyword/score pairs
    """
    num_topics = model.num_topics
    topic_words_dic = {i: model.show_topic(i) for i in range(num_topics)}
    # JSON object keys must be strings, so stringify both keys and values.
    topic_words_dic = {str(key): str(value) for key, value in topic_words_dic.items()}
    topic_words_json = json.dumps(topic_words_dic)
    return topic_words_json


def get_sentence_distribution(model: gensim.models.ldamodel.LdaModel, dic: gensim.corpora.Dictionary, sentences: list):
    """Returns the dominant topic and its score for every sentence in the text.

    :param model: LDA model
    :param dic: gensim Dictionary mapping words to their integer ids
    :param sentences: list of sentences in the text
    :return: a list of tuples of the form (topic_number, score), one per sentence
    """
    distribution = []
    for sentence in sentences:
        sent_tokens = nltk.word_tokenize(sentence)
        bow = dic.doc2bow(sent_tokens)
        dist = model.get_document_topics(bow)
        # Keep only the topic with the highest probability for this sentence.
        dist = max(dist, key=lambda x: x[1])
        distribution.append(dist)
    return distribution
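

# A minimal sketch of calling get_sentence_distribution on raw text, continuing
# the example above; the text is made up for illustration, and nltk's sentence
# and word tokenizers need the "punkt" data to be downloaded first.
#
#     nltk.download("punkt")
#     text = "Floods displaced thousands. Farmers moved north after the drought."
#     sents = nltk.sent_tokenize(text)
#     dists = get_sentence_distribution(lda, dic, sents)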


def group_sentences(model: gensim.models.ldamodel.LdaModel, dists: list, sentences: list):
    """Returns groups of sentences for every topic.

    If the number of topics is greater than 1, returns a dictionary where every
    topic maps to a list of tuples of the form (sentence_number, score), sorted
    in descending order of score.

    If the number of topics is 1, returns a dictionary with a single item whose
    value is a list of (sentence_number, score) tuples sorted in descending
    order. In that case the score is the number of topic keywords appearing in
    the sentence divided by the number of words in the sentence.

    :param model: LDA model
    :param dists: a list of tuples of the form (topic_number, score)
    :param sentences: a list of sentences in the original text
    :return: dictionary of topic number and the sentences corresponding to the topic
    """
    no_topics = model.num_topics
    if no_topics != 1:
        sentence_groups = {i: [] for i in range(no_topics)}
        for i in range(len(dists)):
            key = dists[i][0]
            val = (i, dists[i][1])
            sentence_groups[key].append(val)
        for k, v in sentence_groups.items():
            v.sort(key=lambda x: x[1], reverse=True)
        return sentence_groups
    else:
        scores = {}
        sentence_groups = {}
        keywords = [item[0] for item in model.show_topic(0)]
        for i in range(len(sentences)):
            common_keywords = len(set(sentences[i].split()) & set(keywords))
            score = common_keywords / len(sentences[i].split())
            scores[i] = score
        scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        sentence_groups[0] = scores
        return sentence_groups
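

# A small worked example (illustrative values only): with two topics and
# dists = [(0, 0.9), (1, 0.8), (0, 0.6)], group_sentences returns
# {0: [(0, 0.9), (2, 0.6)], 1: [(1, 0.8)]}, i.e. sentence indices grouped by
# their dominant topic and sorted by score.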


def get_summaries(groups: dict, sentences: list):
    """Returns a string of text (summary) for every topic.

    :param groups: dictionary of topic number and the sentences corresponding to the topic
    :param sentences: list of sentences in the original text
    :return: JSON string of topic numbers and their extracted summaries
    """
    summaries = {}
    for k, v in groups.items():
        no_of_sentences = len(v)
        # Cap each summary at the ten highest-scoring sentences.
        if no_of_sentences > 10:
            v = v[:10]
        summaries[k] = []
        for sent_index in v:
            summaries[k].append(sentences[sent_index[0]])
    for k, v in summaries.items():
        summaries[k] = ' '.join(v)
    json_summaries = [{'topic': k, 'summary': v} for k, v in summaries.items()]
    json_summaries = json.dumps(json_summaries)
    return json_summaries


def find_entities(summary):
    """Returns the named entities found in a summary.

    :param summary: summary text to scan for named entities
    :return: JSON string mapping entity text to its entity label
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(summary)
    entities = []
    for i in doc.ents:
        # Keep only a subset of spaCy entity labels.
        if i.label_ in ['EVENT', 'FAC', 'GPE', 'LOC', 'NORP', 'ORG', 'PERSON', 'WORK_OF_ART']:
            entities.append((i.text, i.label_))
    entities_dict = {}
    for i in entities:
        entities_dict[i[0]] = i[1]
    entities_json = json.dumps(entities_dict)
    return entities_json
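

# A minimal end-to-end sketch of how the functions above fit together. The
# sample text is made up for illustration; running it requires the nltk
# "punkt" data and the spaCy "en_core_web_sm" model to be installed.
if __name__ == "__main__":
    sample_text = (
        "Rising seas flooded the coastal villages and thousands of families moved inland. "
        "A long drought ruined the harvest, so farmers left for the cities. "
        "Aid agencies opened shelters near the border to receive the new arrivals."
    )
    sentences = nltk.sent_tokenize(sample_text)
    tokenized = [nltk.word_tokenize(s) for s in sentences]

    # Build the bag-of-words corpus and train a small LDA model.
    dic = gensim.corpora.Dictionary(tokenized)
    bow_corpus = [dic.doc2bow(tokens) for tokens in tokenized]
    lda = topic_model(bow_corpus, dic, number_of_topics=2)
    print(get_topic_keywords(lda))

    # Group sentences by dominant topic and extract per-topic summaries.
    dists = get_sentence_distribution(lda, dic, sentences)
    groups = group_sentences(lda, dists, sentences)
    summaries = get_summaries(groups, sentences)
    print(summaries)

    # Entity extraction runs on the raw summary strings.
    for item in json.loads(summaries):
        print(find_entities(item['summary']))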