From 3870bb263bcd5d78226fc16070867ac654da8f73 Mon Sep 17 00:00:00 2001
From: Ilia Kulikov
Date: Fri, 29 Dec 2017 22:19:54 -0500
Subject: [PATCH 1/3] Some fixes in LM part regarding ngram history length +
 MLE ngram

---
 lecture_note.tex | 106 +++++++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 40 deletions(-)

diff --git a/lecture_note.tex b/lecture_note.tex
index d260d79..0f3ec83 100644
--- a/lecture_note.tex
+++ b/lecture_note.tex
@@ -3555,7 +3555,7 @@ \section{$n$-Gram Language Model}
 limit the maximum length of phrases/sentences we estimate a probability on}.
 This idea is a foundation on which a so-called $n$-gram language model is
 based.
-In the $n$-gram language model, we first rewrite the probability of a given
+In the $n$-gram language model, using the chain rule, we first rewrite the probability of a given
 sentence $S$ from Eq.~\eqref{eq:sentence_prob} into
 \begin{align}
     \label{eq:unidir_sentence}
@@ -3568,11 +3568,12 @@ \section{$n$-Gram Language Model}
 conditional probability (Eq.~\eqref{eq:unidir_sentence}~(a)) is only
 conditioned on the $n-1$ preceding symbols only, meaning
 \begin{align*}
-    p(w_k | w_{
 0 \\
-        \gamma(w_{k-n+1}, \ldots, w_{k}) p^S(w_{k}|w_{k-n+1}, \ldots,
+        \gamma(w_{k-n+2}, \ldots, w_{k}) p^S(w_{k}|w_{k-n+2}, \ldots,
         w_{k-1}), \text{ otherwise}
     \end{array}
     \right.
@@ -3784,17 +3810,17 @@ \subsection{Smoothing and Back-Off}
 Also, let us define the following quantities describing the number of all
 possible words following a given $n$-gram with a specified frequency $l$:
 \begin{align*}
-    N_l(w_{k-n}, \ldots, w_{k-1}) = |\{ c(w_{k-n}, \ldots, w_{k-1}, w_k) = l \}|
+    N_l(w_{k-n+1}, \ldots, w_{k-1}) = |\{ c(w_{k-n+1}, \ldots, w_{k-1}, w_k) = l \}|
 \end{align*}
 The modified KN smoothing then defines $\alpha$ in
 Eq.~\eqref{eq:n_gram_smoothing_general} to be
 \begin{align*}
-    \alpha(w_k | w_{k-n}, \ldots, w_{k-1}) =
+    \alpha(w_k | w_{k-n+1}, \ldots, w_{k-1}) =
     \frac{
-        c(w_{k-n}, \ldots, w_{k-1}, w_k) - D(c(w_{k-n}, \ldots, w_{k-1}, w_k))
+        c(w_{k-n+1}, \ldots, w_{k-1}, w_k) - D(c(w_{k-n+1}, \ldots, w_{k-1}, w_k))
     }{
-        \sum_{w' \in V} c(w_{k-n}, \ldots, w_{k-1}, w')
+        \sum_{w' \in V} c(w_{k-n+1}, \ldots, w_{k-1}, w')
     },
 \end{align*}
 where $D$ is
@@ -3810,13 +3836,13 @@ \subsection{Smoothing and Back-Off}
 \end{align*}
 And, $\gamma$ is defined as
 \begin{align*}
-    \gamma(w_{k-n}, \ldots, w_{k-1}) =
+    \gamma(w_{k-n+1}, \ldots, w_{k-1}) =
     \frac{
-        D_1 N_1(w_{k-n}, \ldots, w_{k-1})
-        + D_2 N_2(w_{k-n}, \ldots, w_{k-1})
-        + D_{3+} N_{3+}(w_{k-n}, \ldots, w_{k-1})
+        D_1 N_1(w_{k-n+1}, \ldots, w_{k-1})
+        + D_2 N_2(w_{k-n+1}, \ldots, w_{k-1})
+        + D_{3+} N_{3+}(w_{k-n+1}, \ldots, w_{k-1})
     }{
-        \sum_{w' \in V} c(w_{k-n}, \ldots, w_{k-1}, w')
+        \sum_{w' \in V} c(w_{k-n+1}, \ldots, w_{k-1}, w')
     }.
 \end{align*}
@@ -3904,11 +3930,11 @@ \section{Neural Language Model}
 One thing we notice from $n$-gram language modelling is that this boils down
 to computing the conditional distribution of a next word $w_k$ given $n-1$
-preceding words $w_{k-n}, \ldots, w_{k-1}$. In other words, the goal of $n$-gram
+preceding words $w_{k-n+1}, \ldots, w_{k-1}$. In other words, the goal of $n$-gram
 language modeling is to find a function that takes as input $n-1$ words and
 returns a conditional probability of a next word:
 \begin{align*}
-    p(w_k | w_{k-n}, \ldots, w_{k-1}) = f_{\TT}^{w_k} (w_{k-n}, \ldots,
+    p(w_k | w_{k-n+1}, \ldots, w_{k-1}) = f_{\TT}^{w_k} (w_{k-n+1}, \ldots,
     w_{k-1}).
 \end{align*}
 This is almost exactly what we have learned in

From 0b5ad71f464a12b4c90e95007fc896e1717a2e6c Mon Sep 17 00:00:00 2001
From: Ilia Kulikov
Date: Fri, 29 Dec 2017 22:50:31 -0500
Subject: [PATCH 2/3] wipe comments, + such as

---
 lecture_note.tex | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lecture_note.tex b/lecture_note.tex
index 0f3ec83..22878fa 100644
--- a/lecture_note.tex
+++ b/lecture_note.tex
@@ -3568,12 +3568,11 @@ \section{$n$-Gram Language Model}
 conditional probability (Eq.~\eqref{eq:unidir_sentence}~(a)) is only
 conditioned on the $n-1$ preceding symbols only, meaning
 \begin{align*}
-    % p(w_k | w_{

Date: Fri, 29 Dec 2017 22:52:16 -0500
Subject: [PATCH 3/3] such an

---
 lecture_note.tex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lecture_note.tex b/lecture_note.tex
index 22878fa..96dbfcc 100644
--- a/lecture_note.tex
+++ b/lecture_note.tex
@@ -3685,7 +3685,7 @@ \subsection{Smoothing and Back-Off}
 The biggest issue of having an $n$-gram that never occurs in the training corpus
-is that any sentence containing such as $n$-gram will be given a zero probability
+is that any sentence containing such an $n$-gram will be given a zero probability
 regardless of how likely all the other $n$-grams are. Let us continue with the
 example of ``I like llama''. With an $n$-gram language model built using all the
 books in Google Books, the following, totally valid sentence\footnote{
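
Note (not part of the patches above): the indexing fix that runs through this series is that an $n$-gram model conditions on the $n-1$ preceding words $w_{k-n+1}, \ldots, w_{k-1}$, not on $n$ of them. The following minimal Python sketch illustrates that convention with plain maximum-likelihood counts, i.e. the relative frequency c(history, w_k) / c(history); it does not implement the modified Kneser-Ney smoothing discussed in the diff, and the function names and toy corpus are illustrative only, not taken from lecture_note.tex.

from collections import Counter

def ngram_counts(corpus, n):
    """Count n-grams and their (n-1)-word histories over tokenized sentences."""
    ngrams, histories = Counter(), Counter()
    for sentence in corpus:
        tokens = ["<s>"] * (n - 1) + sentence + ["</s>"]
        for k in range(n - 1, len(tokens)):
            history = tuple(tokens[k - n + 1:k])      # the n-1 preceding words
            ngrams[history + (tokens[k],)] += 1
            histories[history] += 1
    return ngrams, histories

def mle_prob(word, history, ngrams, histories):
    """p(w_k | w_{k-n+1}, ..., w_{k-1}) estimated by relative frequency."""
    if histories[history] == 0:
        return 0.0                                    # unseen history: no MLE estimate
    return ngrams[history + (word,)] / histories[history]

# Toy usage with a bigram model (n = 2): each history is a single preceding word.
corpus = [["i", "like", "llama"], ["i", "like", "coffee"]]
ngrams, histories = ngram_counts(corpus, n=2)
print(mle_prob("like", ("i",), ngrams, histories))      # 1.0
print(mle_prob("llama", ("like",), ngrams, histories))  # 0.5

The zero returned for an unseen history (or unseen n-gram) is exactly the failure mode that motivates the smoothing and back-off discussion patched above.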