JuliaText · tejasvaidhyadev · Mar 8, 2020 · Mar 8, 2020 · Mar 8, 2020 · Mar 8, 2020
diff --git a/README.md b/README.md
@@ -40,3 +40,4 @@ Follow the links below for full docs on the usage of the corpora.
  - [IMDB movie reviews](docs/src/IMDB.md)
  - [Twitter sentiment dataset](docs/src/Twitter.md)
  - [Stanford Sentiment Treebank](docs/src/SST.md)
+ - [GMB](docs/src/GMB.md)
diff --git a/docs/make.jl b/docs/make.jl
@@ -17,7 +17,8 @@ makedocs(modules = [CorpusLoaders],
              "Twitter" => "Twitter.md",
              "WikiCorpus" => "WikiCorpus.md",
              "WikiGold" => "WikiGold.md",
-             "API References" => "APIReference.md"
+             "API References" => "APIReference.md",
+             "GMB" => "GMB.md"
         ])
 
 

diff --git a/docs/src/GMB.md b/docs/src/GMB.md
@@ -0,0 +1,69 @@
+# GMB
+This dataset is an extract from the GMB corpus, which is tagged, annotated, 
+and built specifically to train classifiers to predict named entities such as names, locations, etc. 
+
+GMB is a fairly large corpus with a lot of annotations.
+Unfortunately, GMB is not perfect. It is not a gold standard corpus, meaning that it’s not completely human annotated and it’s not considered 100% correct. 
+The corpus was created using existing annotators and then corrected by humans where needed.
+
+The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+The GMB is developed at the [University of Groningen](https://www.rug.nl/).
+ A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+The part-of-speech tagset used is the Penn Treebank tagset, as listed in Ann Taylor, Mitchell Marcus and Beatrice Santorini (2003): [The Penn Treebank: An Overview](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.9.8216), Section 1.1.
+
+
+
+For more details, refer to the [GMB website](https://gmb.let.rug.nl/about.php).
+
+```julia
+
+julia> Data = load(GMB())
+37789-element Array{Array{PosTaggedWord,1},1}:
+ [PosTaggedWord("NNS", "Families"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "soldiers"), PosTaggedWord("VBN", "killed"), PosTaggedWord("IN", "in"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "conflict"), PosTaggedWord("VBD", "joined"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "protesters")  …  PosTaggedWord("CD", "One"), PosTaggedWord("NN", "Terrorist"), PosTaggedWord("RQU", "\""), PosTaggedWord("CC", "and"), PosTaggedWord("LQU", "\""), PosTaggedWord("VB", "Stop"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Bombings"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")]
+
+ [PosTaggedWord("PRP", "They"), PosTaggedWord("VBD", "marched"), PosTaggedWord("IN", "from"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Houses"), PosTaggedWord("IN", "of"), PosTaggedWord("NN", "Parliament"), PosTaggedWord("TO", "to"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "rally"), PosTaggedWord("IN", "in"), PosTaggedWord("NNP", "Hyde"), PosTaggedWord("NNP", "Park"), PosTaggedWord(".", ".")]
+
+ [PosTaggedWord("NNS", "Police"), PosTaggedWord("VBD", "put"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "number"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "marchers"), PosTaggedWord("IN", "at"), PosTaggedWord("CD", "10,000"), PosTaggedWord("IN", "while"), PosTaggedWord("NNS", "organizers"), PosTaggedWord("VBD", "claimed"), PosTaggedWord("PRP", "it"), PosTaggedWord("VBD", "was"), PosTaggedWord("CD", "100,000"), PosTaggedWord(".", ".")]
+
+  ⋮
+
+ [PosTaggedWord("IN", "At"), PosTaggedWord("JJ", "last"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goatherd"), PosTaggedWord("VBD", "threw"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "stone"), PosTaggedWord(",", ","), PosTaggedWord("CC", "and"), PosTaggedWord("VBG", "breaking")  …  PosTaggedWord(",", ","), PosTaggedWord("VBD", "begged"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("RB", "not"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "tell"), PosTaggedWord("PRP\$", "his"), PosTaggedWord("NN", "master"), PosTaggedWord(".", ".")]
+
+ [PosTaggedWord("DT", "The"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("VBD", "replied"), PosTaggedWord(",", ","), PosTaggedWord("LQU", "\""), PosTaggedWord("WRB", "Why"), PosTaggedWord(",", ","), PosTaggedWord("PRP", "you"), PosTaggedWord("JJ", "silly"), PosTaggedWord("NN", "fellow")  …  PosTaggedWord("DT", "the"), PosTaggedWord("NN", "horn"), PosTaggedWord("MD", "will"), PosTaggedWord("VB", "speak"), PosTaggedWord("IN", "though"), PosTaggedWord("PRP", "I"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "silent"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")]
+
+ [PosTaggedWord("VBP", "Do"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "attempt"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "hide"), PosTaggedWord("NNS", "things"), PosTaggedWord("WDT", "which"), PosTaggedWord("MD", "can"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "hid"), PosTaggedWord(".", ".")] 
+
+julia> Data[1]
+30-element Array{PosTaggedWord,1}:
+ PosTaggedWord("NNS", "Families")
+ PosTaggedWord("IN", "of")
+ PosTaggedWord("NNS", "soldiers")
+ PosTaggedWord("VBN", "killed")
+ PosTaggedWord("IN", "in")
+ PosTaggedWord("DT", "the")
+ PosTaggedWord("NN", "conflict")
+ PosTaggedWord("VBD", "joined")
+ PosTaggedWord("DT", "the")
+ PosTaggedWord("NNS", "protesters")
+ PosTaggedWord("WP", "who")
+ PosTaggedWord("VBD", "carried")
+ PosTaggedWord("NNS", "banners") 
+ PosTaggedWord("IN", "with") 
+ ⋮
+ PosTaggedWord("IN", "as")
+ PosTaggedWord("LQU", "\"")
+ PosTaggedWord("NNP", "Bush")
+ PosTaggedWord("NN", "Number")
+ PosTaggedWord("CD", "One") 
+ PosTaggedWord("NN", "Terrorist")
+ PosTaggedWord("RQU", "\"") 
+ PosTaggedWord("CC", "and") 
+ PosTaggedWord("LQU", "\"") 
+ PosTaggedWord("VB", "Stop")
+ PosTaggedWord("DT", "the") 
+ PosTaggedWord("NNS", "Bombings")
+ PosTaggedWord(".", ".") 
+ PosTaggedWord("LQU", "\"") 
+
+```
diff --git a/src/CorpusLoaders.jl b/src/CorpusLoaders.jl
@@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord
 export title, sensekey, word, named_entity, part_of_speech
 export load
 
-export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000
+export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB
 
 function __init__()
     include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
@@ -24,6 +24,7 @@ function __init__()
     include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
     include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
     include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl"))
+    include(joinpath(@__DIR__, "GMB_DataDeps.jl"))
 end
 
 include("types.jl")
@@ -38,5 +39,5 @@ include("Twitter.jl")
 include("StanfordSentimentTreebank.jl")
 include("WikiGold.jl")
 include("CoNLL2000.jl")
-
+include("GMB.jl")
 end
diff --git a/src/GMB.jl b/src/GMB.jl
@@ -0,0 +1,57 @@
+"""
+    GMB(filepath::Vector)
+    GMB(dirpath)
+    GMB()
+
+Handle describing which Groningen Meaning Bank (GMB) tag files `load` should read.
+
+The single field `filepath` stores the paths of the `en.tags` files.
+
+- `GMB(filepath::Vector)` wraps an explicit list of tag files.
+- `GMB(dirpath)` collects every file matching `data/*/*/en.tags` below `dirpath`.
+- `GMB()` uses the directory of the `GMB 2.2.0` data dependency.
+
+`GMB(dirpath)` throws an `ArgumentError` when `dirpath` is not an existing directory.
+"""
+struct GMB{S}
+    filepath::Vector{S}
+end
+
+function GMB(dirpath)
+    # Validate with an explicit throw: `@assert` may be disabled at some
+    # optimisation levels, so it must not be the only guard on user input.
+    isdir(dirpath) || throw(ArgumentError("not a directory: $dirpath"))
+    paths = glob("data/*/*/en.tags", dirpath)
+    return GMB(paths)
+end
+
+GMB() = GMB(datadep"GMB 2.2.0")
+
+# Level names for GMB as consumed through `MultiResolutionIterators.levelname_map`.
+# Aliases share a depth: 1 = document, 2 = paragraph, 3 = sentence, 4 = word,
+# 5 = character.
+# NOTE(review): `load(::GMB)` in this file returns sentences of words (two levels
+# of nesting); confirm these depths line up with that structure.
+function MultiResolutionIterators.levelname_map(::Type{GMB})
+    return [
+        :doc => 1,
+        :contextfile => 1,
+        :context => 1,
+        :document => 1,
+        :para => 2,
+        :paragraph => 2,
+        :sent => 3,
+        :sentence => 3,
+        :word => 4,
+        :token => 4,
+        :char => 5,
+        :character => 5,
+    ]
+end
+
+"""
+    parse_gmb_tagged_word(line::AbstractString)
+
+Build a `PosTaggedWord` from one tab-separated `line`.
+
+The first field is used as the word and the second field as its tag; any
+further fields are ignored. A line with fewer than two fields raises a
+`BoundsError`.
+"""
+function parse_gmb_tagged_word(line::AbstractString)
+    fields = split(line, '\t')
+    token, tag = fields[1], fields[2]
+    return PosTaggedWord(tag, token)
+end
+
+"""
+    parse_gmb(filename)
+
+Read the tag file `filename` and return its sentences as a nested vector
+(sentences of `PosTaggedWord`s).
+
+Every non-empty line is parsed with `parse_gmb_tagged_word`; empty lines separate
+sentences. A sentence is opened only when its first token is seen, so
+
+- tokens that precede the first empty line are kept as the first sentence, and
+- leading, repeated or trailing empty lines never yield empty sentences.
+"""
+function parse_gmb(filename)
+    sents = @NestedVector(PosTaggedWord, 2)()
+
+    # `true` while the next token has to start a fresh sentence: at the beginning
+    # of the file and after every empty line.
+    at_boundary = true
+
+    for line in eachline(filename)
+        if isempty(line)
+            at_boundary = true
+            continue
+        end
+        if at_boundary
+            push!(sents, @NestedVector(PosTaggedWord, 1)())
+            at_boundary = false
+        end
+        # The sentence being filled is always the most recently opened one.
+        push!(last(sents), parse_gmb_tagged_word(line))
+    end
+    return sents
+end
+
+"""
+    load(corpus::GMB)
+
+Parse every file listed in `corpus.filepath` with `parse_gmb` and return all of
+their sentences concatenated, in file order, as one nested vector of
+`PosTaggedWord`s.
+"""
+function load(corpus::GMB)
+    sentences = @NestedVector(PosTaggedWord, 2)()
+    foreach(corpus.filepath) do path
+        append!(sentences, parse_gmb(path))
+    end
+    return sentences
+end
diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
@@ -0,0 +1,33 @@
+using DataDeps
+
+
+# Register one DataDep per listed GMB release. Each entry pairs a version string
+# with the checksum handed to `DataDep` for that release's archive.
+for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"),
+            ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"),
+            ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"),
+            ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"),
+            ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625")]
+
+    register(DataDep("GMB $ver",
+        """
+        Website: https://gmb.let.rug.nl/data.php
+        Orignal Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes
+
+        The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+        The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank. 
+        A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+        Please cite the following publication if you use the corpora:
+        Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496."
+        """,
+        "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
+        checksum;
+        post_fetch_method = function (fn)
+            unpack(fn)
+            # Must spell "gmb" exactly like the archive name above; the previous
+            # "gbm" spelling pointed `readdir` at a directory that does not exist.
+            # NOTE(review): assumes the zip unpacks into a single top-level
+            # directory named after the archive - confirm for every release.
+            innerdir = "gmb-$(ver)"
+            innerfiles = readdir(innerdir)
+            # Move everything to current directory, under same name
+            mv.(joinpath.(innerdir, innerfiles), innerfiles)
+            # Everything was moved out above, so a non-recursive `rm` is enough.
+            rm(innerdir)
+        end
+    ))
+end
diff --git a/test/test_GMB.jl b/test/test_GMB.jl
@@ -0,0 +1,21 @@
+using CorpusLoaders
+using Test
+using Base.Iterators
+using MultiResolutionIterators
+using DataDeps
+
+# Run the same checks against every registered GMB release.
+@testset "Using flatten_levels" for path in [datadep"GMB 1.0.0", datadep"GMB 1.1.0", datadep"GMB 2.0.0", datadep"GMB 2.1.0", datadep"GMB 2.2.0"]
+    # Load the release under test; `GMB()` would load 2.2.0 on every iteration
+    # and leave `path` unused.
+    train = load(GMB(path))
+    docs = train[1:5]
+
+    # Use the GMB level map (not CoNLL's) to flatten everything except words.
+    # NOTE(review): `load` returns sentences of words while the GMB level map
+    # puts :word at depth 4 - confirm `flatten_levels` behaves as intended here.
+    words = full_consolidate(flatten_levels(docs, (!lvls)(GMB, :word)))
+    @test length(words) > length(docs)
+    @test typeof(words) == Vector{PosTaggedWord}
+
+    plain_words = word.(words)
+    @test typeof(plain_words) <: Vector{String}
+
+    # GMB tokens are loaded as `PosTaggedWord`s, so the tag accessor is
+    # `part_of_speech` rather than `named_entity`.
+    pos_tags = part_of_speech.(words)
+    @test typeof(pos_tags) <: Vector{String}
+
+end