From 6f9bfeda6625386561eaa0ad51334a9f8d4b1144 Mon Sep 17 00:00:00 2001
From: Francis Bond <bond@ieee.org>
Date: Mon, 8 Jun 2020 01:22:10 +0800
Subject: [PATCH 01/41] parse tdl from grammar file, closes #2; allow both lkb
 and ace grammars; many minor bugfixes

---
 README.rst          |  18 ++--
 ToDo                |   6 ++
 gold2db.py          | 189 +++++++++++++++++++------------------
 html/ltdb.css       |   6 +-
 html/ltdb.py        |  36 ++++++-
 html/ltypes.cgi     |   4 +-
 html/more.cgi       |   2 +-
 html/rules.cgi      |   4 +-
 html/search.cgi     |   2 +-
 html/showtype.cgi   |   6 +-
 make-ltdb.bash      | 132 +++++++++++++++++---------
 makehome.py         |  30 +++++-
 patch-lextypedb.lsp |   2 +-
 tables.sql          |  13 ++-
 tdl2db.py           | 225 ++++++++++++++++++++++++++++++++------------
 xml2db.py           |  13 +--
 16 files changed, 449 insertions(+), 239 deletions(-)

diff --git a/README.rst b/README.rst
index 58231fb..4a36937 100644
--- a/README.rst
+++ b/README.rst
@@ -17,11 +17,17 @@ the DELPH-IN Wiki.
 Usage
 -----
 
-1. Run ``./make-ltdb.bash --grmdir /path/to/grammar``
+1. Run ``./make-ltdb.bash --script /path/to/grammar/lkb/script``
+
+or (somewhat experimental, but it gets more docstrings)
+
+2. Run ``./make-ltdb.bash --grmtdl /path/to/grammar/grammar.tdl``
+
 
 .. code:: bash
 
-   ./make-ltdb.bash --grmdir ~/logon/dfki/jacy
+   ./make-ltdb.bash --script ~/logon/dfki/jacy/lkb/script
+   ./make-ltdb.bash --grmtdl ~/logon/dfki/jacy/japanese.tdl
 
 Everything is installed to ``~/public_html/``
 
@@ -43,7 +49,7 @@ Requirements
 We prefer that sentence IDs be unique; if we see two sentences in the
 gold treebank with the same ID, we only store the first one.
 
-Only the new LKB-FOS (http://moin.delph-in.net/LkbFos) suppoorts the new docstring comments.  We assume it is installed in
+Only the new LKB-FOS (http://moin.delph-in.net/LkbFos) supports the new docstring comments.  We assume it is installed in
 ``LKBFOS=~/delphin/lkb_fos/lkb.linux_x86_64``.
 
 Install dependencies (in Ubuntu):
@@ -114,15 +120,15 @@
 Types and instances are in the same table, distinguished by status.
 +==========+====================================+===================+======+
 |type      |normal type                         |                   |      |
 +----------+------------------------------------+-------------------+------+
-|ltype     |lexical type                        |type + in lexicon  | _lt  |
+|lex-type  |lexical type                        |type + in lexicon  | _lt  |
 +----------+------------------------------------+-------------------+------+
 |lex-entry |lexical entry                       |                   | _le  |
 +----------+------------------------------------+-------------------+------+
 |rule      |syntactic construction/grammar rule | LKB:\*RULES       | _c   |
 +----------+------------------------------------+-------------------+------+
-|lrule     |lexical rule                        | LKB:\*LRULES      | lr   |
+|lex-rule  |lexical rule                        | LKB:\*LRULES      | lr   |
 +----------+------------------------------------+-------------------+------+
-|irule     |inflectional rule                   | LKB:\*LRULES +    | ilr  |
+|inf-rule  |inflectional rule                   | LKB:\*LRULES +    | ilr  |
 +----------+------------------------------------+-------------------+------+
 |          |    (inflectional-rule-pid )        |                   |      |
 +----------+------------------------------------+-------------------+------+
diff --git a/ToDo b/ToDo
index b7348f2..65d6c3f 100644
--- a/ToDo
+++ b/ToDo
@@ -1,3 +1,9 @@
+ * look at lisp with John
+ * prettier lisp
+   * hyperlinked types
+   * types without glb
+
+
 * Better linking to surface form
diff --git a/gold2db.py b/gold2db.py
index 50cd3d9..b9d66a6 100644
--- a/gold2db.py
+++ b/gold2db.py
@@ -1,7 +1,7 @@
 #export PYTHONPATH=~/svn/pydelphin
 # python3 gold2db.py
 ##
-## takes two paramaters -- directory with the xml and database
+## takes two parameters -- the directory with the grammar, and the database
 ##
 ## Actually does the lexicon too :-)
 ##
@@ -10,11 +10,8 @@
 ##
 import sqlite3, sys, re, os
 from collections import defaultdict as dd
-from delphin import itsdb
-import delphin.mrs
-import delphin.derivation
-import delphin.mrs.xmrs
-import delphin.mrs.simplemrs
+from delphin import itsdb, derivation, dmrs
+from delphin.codecs import simplemrs, dmrsjson, mrsjson
 import json
 
 if (len(sys.argv) < 3):
@@ -41,114 +38,124 @@
 mroot=re.compile(r'^\(([-a-zA-z0-9_+]+?)\s+\(')
 mrule=re.compile(r'\([0-9]+ ([^ ]+) [-0-9.]+ ([0-9]+) ([0-9]+) ')
-mlex=re.compile(r'\([0-9]+ ([^ ]+) [-0-9.]+ [0-9]+ [0-9]+ \("(.*?)" ')
+#mlex=re.compile(r'\([0-9]+ ([^ ]+) [-0-9.]+ [0-9]+ [0-9]+ \("(.*?)" ')
 
 ### make a log in the same directory as the database
 log = open(os.path.join(os.path.dirname(dbfile),"gold.log"), 'w')
 
-
-
 golddir = '%s/tsdb/gold' % grmdir
 
 typefreq=dd(int)             # typefreq[type] = freq
 lexfreq=dd(lambda: dd(int))  # lexfreq[lexid][surf] = freq
 lxidfreq=dd(lambda: dd(int)) # lxidfreq[typ][lexid] = freq
-typind=dd(lambda: dd(set))   # typind[type][sid]((frm, to), ...)
-sent=dd(list)                # sent[sid][(surf, lexid)]
-pname=dict()                 # pname[sid]=profile
+typind=dd(lambda: dd(set))   # typind[type][(profile, sid)]((frm, to), ...)
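+## Illustration only (hypothetical profile name, item id, and lexical ids,
+## not taken from any real grammar): after reading item 42 "dogs bark" from
+## a profile called "mrs", we would expect, e.g.:
+##     lexfreq['dog_n_le']['dogs'] == 1
+##     sent[('mrs', 42)] == [('dogs', 'dog_n_le'), ('bark', 'bark_v_le')]
+##     typind['n_pl_olr'][('mrs', 42)] == {(0, 1)}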
+sent=dd(list)                # sent[(profile, sid)][(surf, lexid)]
 roots=dd(lambda: 'rootless')
 allroots=set()
 
 for root, dirs, files in os.walk(golddir):
+    #if not root.endswith('e'):    for debugging, don't load everything
+    #    continue
     ### find valid profiles
     if 'result' in files or 'result.gz' in files:
         # if 'mrs' not in root:   ## debug
         #     continue
         print("Processing %s" % root, file=sys.stderr)
-        profile = itsdb.ItsdbProfile(root)
-        head, profname = os.path.split(root)
-        items = {}
-        for row in profile.read_table('item'):
-            items[row['i-id']] = (row['i-input'], row['i-comment'])
-        for row in profile.read_table('result'):
-            pid = row['parse-id']
-            pname[pid] = profname
-            deriv = row['derivation'] # DERIVATION TREE
-            deriv_json = delphin.derivation.Derivation.from_string(deriv).to_dict(fields=['id','entity','score','form','tokens'])
-            mrs_string = row['mrs']
-            try:
-                mrs_obj = delphin.mrs.simplemrs.loads(mrs_string, single=True, version=1.1, errors='strict')
-                # mrs_obj = delphin.mrs.simplemrs.loads(row['mrs'], single=True, version=1.1, strict=False, errors='warn')
-                # mrs_string = row['mrs'] # CHANGING
-                mrs_json = delphin.mrs.xmrs.Mrs.to_dict(mrs_obj)
-                dmrs_json = delphin.mrs.xmrs.Dmrs.to_dict(mrs_obj)
-            except Exception as e:
-                log.write("\n\nMRS failed to convert in pydelphin:\n")
-                log.write("{}: {}\n".format(root, pid))
-                log.write(items[pid][0])
-                log.write("\n\n")
-                log.write(str(mrs_string))
-                log.write("\n\n")
-                if hasattr(e, 'message'):
-                    log.write(e.message)
-                else:
-                    log.write(str(e))
-                log.write("\n\n")
-                mrs_json = dict()
-                dmrs_json = dict()
+        ts = itsdb.TestSuite(root)
+        for response in ts.processed_items():
+            sid=response['i-id']
+            profile = ts.path.name
+            if response['readings'] > 0:
+                try:
+                    first_result=response.result(0)
+                    deriv = first_result.derivation()
+                    mrs_obj=first_result.mrs()
+                    mrs_str = first_result['mrs']
+                    tree = first_result.get('tree', '')
+                    deriv_str = deriv.to_udf(indent=None)
+                    deriv_json = json.dumps(deriv.to_dict(fields=['id','entity','score','form','tokens']))
+                except Exception as e:
+                    log.write("\n\nSomething went wrong getting the result:\n")
+                    log.write("{}: {} {}\n".format(root, profile, sid))
+                    deriv = ''
+                    mrs_obj = None
+                    mrs_str = ''
+                    tree = ''
+                    deriv_str = ''
+                    deriv_json = ''
+                    try:
+                        mrs_obj=first_result.mrs()
+                    except Exception as e:
+                        log.write("\n\nMRS couldn't be retrieved in pydelphin:\n")
+                        log.write("{}: {} {}\n".format(root, profile, sid))
+                        mrs_obj = None
+                try:
+                    dmrs_obj=dmrs.from_mrs(mrs_obj)
+                    mrs_json = mrsjson.encode(mrs_obj)
+                    dmrs_json = dmrsjson.encode(dmrs_obj)
+                except Exception as e:
+                    log.write("\n\nMRS failed to convert in pydelphin:\n")
+                    log.write("{}: {} {}\n".format(root, profile, sid))
+                    log.write(response['i-input']) ### FIXME
+                    log.write("\n\n")
+                    if mrs_obj:
+                        log.write(simplemrs.encode(mrs_obj,indent=True))
+                    log.write("\n\n")
+                    log.write(repr(e))
+                    if hasattr(e, 'message'):
+                        log.write(e.message)
+                    # else:
+                    #     log.write(str(e))
+                    log.write("\n\n")
+                    mrs_json = '{}'
+                    dmrs_json = '{}'
             # STORE gold info IN DB
             try:
-                c.execute("""INSERT INTO gold (sid, sent, comment,
+                c.execute("""INSERT INTO gold (profile, sid, sent, comment,
                               deriv, deriv_json, pst,
                               mrs, mrs_json, dmrs_json,
                               flags)
-                VALUES (?,?,?,?,?,?,?,?,?,?)""", (pid, items[pid][0], items[pid][1],
-                                                  deriv, json.dumps(deriv_json), None,
-                                                  mrs_string, json.dumps(mrs_json),
-                                                  json.dumps(dmrs_json), None))
-                ### ToDo use pydelphin to walk down tree
-                ### leaves
-                m = re.findall(mlex,deriv)
-                lexids=set()
-                if m:
-                    #print('leaves')
-                    #print(m)
-                    wid =0
-                    for (lexid, surf) in m:
-                        lexids.add(lexid)
+                VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
+                          (profile,
+                           sid,
+                           response['i-input'],
+                           response['i-comment'],
+                           deriv_str,
+                           deriv_json,
+                           tree,
+                           mrs_str,
+                           mrs_json,
+                           dmrs_json,
+                           None))
+                ## leaves
+                if deriv:
+                    for (preterminal, terminal) in zip(deriv.preterminals(),deriv.terminals()):
+                        lexid=preterminal.entity
+                        surf=terminal.form
+                        start=preterminal.start
+                        end=preterminal.end
                         lexfreq[lexid][surf] +=1
-                        sent[pid].append((surf, lexid))
+                        sent[(profile, sid)].append((surf, lexid))
                         if ltypes[lexid]:
                             typefreq[ltypes[lexid]] += 1
                             lxidfreq[ltypes[lexid]][lexid] += 1
-                            typind[ltypes[lexid]][pid].add((wid, wid+1))
-                        wid+=1
-                ### rules (store as type)
-                m = re.findall(mrule,deriv)
-                if m:
-                    for (typ, frm, to) in m:
-                        if typ not in lexids: ## counted these!
-                            typefreq[typ] += 1
-                            typind[typ][pid].add((frm, to))
-                    #print('rule')
-                    #print(m)
-                ### Root (treat as another type)
-                m = re.search(mroot,deriv)
-                if m:
-                    #print('root {}'.format(root))
-                    #print(m.groups()[0])
-                    #print(deriv)
-                    #print()
-                    roots[pid] = m.groups()[0]
+                            typind[ltypes[lexid]][(profile, sid)].add((start, end))
+                    ### internal nodes (store as types)
+                    for node in deriv.internals():
+                        typ = node.entity
+                        start= node.start
+                        end= node.end
+                        typefreq[typ] += 1
+                        typind[typ][(profile, sid)].add((start, end))
                 ##print('\n\n\n')
             except sqlite3.Error as e:
                 log.write('ERROR: ({}) of type ({}), {}: {}\n'.format(e,
                                                                       type(e).__name__,
-                                                                      root, pid))
+                                                                      root, sid))
 
-### each sentence should have a root
-for s in sent:
-    allroots.add(roots[s])
-    typind[roots[s]][s].add((0, len(sent[s])))
-    typefreq[roots[s]] += 1
+# ### each sentence should have a root
+# for s in sent:
+#     allroots.add(roots[s])
+#     typind[roots[s]][s].add((0, len(sent[s])))
+#     typefreq[roots[s]] += 1
 
 ### calculate the lexical type frequencies
 for typ in lxidfreq:
@@ -190,20 +197,20 @@
     c.execute("""INSERT INTO lexfreq (lexid, word, freq)
                  VALUES (?,?,?)""", (l, w, lexfreq[l][w]))
 
-for s in sent:
+for p,s in sent:
     ##print(s, " ".join([surf for (surf, lexid) in sent[s]]))
-    for i, (w, l) in enumerate(sent[s]):
+    for i, (w, l) in enumerate(sent[(p,s)]):
         c.execute("""INSERT INTO sent (profile, sid, wid, word, lexid)
-                     VALUES (?,?,?,?,?)""", (pname[s], s, i, w, l))
+                     VALUES (?,?,?,?,?)""", (p, s, i, w, l))
 
 for t in typind:
-    for s in typind[t]:
+    for p,s in typind[t]:
         ##print("%s\t%s\t%s" % (t, s, typind[t][s]))
-        for (k, m) in typind[t][s]:
-            c.execute("""INSERT INTO typind (typ, sid, kara, made)
-                         VALUES (?,?,?,?)""", (t, s, k, m))
+        for (k, m) in typind[t][(p, s)]:
+            c.execute("""INSERT INTO typind (typ, profile, sid, kara, made)
+                         VALUES (?,?,?,?,?)""", (t, p, s, k, m))
diff --git a/html/ltdb.css b/html/ltdb.css
index 91768b3..15b4b5d 100644
--- a/html/ltdb.css
+++ b/html/ltdb.css
@@ -117,10 +117,10 @@ tr {background:#F3FFF3} /* FCB likes green */
 td {padding:4pt}
 caption { font-weight:bold; font-size: 18pt;} /* check colors */
-tr.irule {background:#EEEEEE}
+tr.inf-rule {background:#EEEEEE}
 tr.rule {background:#FFAAAA}
-tr.lrule {background:#AAAAFF}
-tr.ltype {background:#AAFFAA}
+tr.lex-rule {background:#AAAAFF}
+tr.lex-type {background:#AAFFAA}
 tr.root {background:#FFAAFF}
 
 pre.code {
diff --git a/html/ltdb.py b/html/ltdb.py
index b249d02..5f35986 100644
--- a/html/ltdb.py
+++ b/html/ltdb.py
@@ -8,6 +8,7 @@
 import sqlite3, collections
 import cgi, re, urllib, sys
 from collections import defaultdict as dd
+from collections import OrderedDict as od
 import json
 
@@ -22,6 +23,33 @@
 ('nil','nil'):(' ', ' '),
 (None,None):(' ', ' ')}
 
+### the different kinds of things we deal with
+statuses = od()
+
+## things used when parsing
+statuses["lex-rule"] = "Lexical Rules"
+statuses["rule"] = "Syntactic Rules"
+statuses["token-mapping-rule"] = "Rules for token mapping"
+statuses["root"] = "Root conditions for well-formed utterances"
+
+## lexical entries
+statuses["lex-entry"] = "Lexical Entries"
+statuses["generic-lex-entry"] = "Generic Lexical Entries"
+
+## types
+statuses["lex-type"] = "Types for lexical entries (immediate supertypes of lex-entries)"
+statuses["type"] = "Other Internal Types"
+
+## pre- and post-processing
+statuses["lexical-filtering-rule"] = "Lexical filtering rules"
+statuses["post-generation-mapping-rule"] = "Post-generation mapping rules"
+
+## interface
+statuses["labels"] = "Labels for trees (parse-nodes)"
+
+
+
 def getpar (params):
     par=dict()
     try:
@@ -452,19 +480,19 @@ def searchbar():
 
 
-def footer():
+def footer(version):
     return """
 Linguistic Type Database
-    for the grammar %s;
-    By Chikara Hashimoto, Luis Morgado da Costa and Francis Bond;
+    for the grammar {};
+    By Chikara Hashimoto, Luis Morgado da Costa, Michael Goodman and Francis Bond;
 Maintained by Francis Bond <bond@ieee.org>;
 Source code (GitHub)
 
-""" % (par['ver'])
+""".format(version)
 
 
 def munge_desc(typ,description):
diff --git a/html/ltypes.cgi b/html/ltypes.cgi
index 0abf4c9..798faee 100755
--- a/html/ltypes.cgi
+++ b/html/ltypes.cgi
@@ -26,7 +26,7 @@ con = sqlite3.connect(par['db'])
 c = con.cursor()
 c.execute("""SELECT types.typ, lname, words, lfreq, cfreq FROM types
     LEFT JOIN ltypes ON types.typ=ltypes.typ
-    WHERE status ='ltype' ORDER BY types.typ""")
+    WHERE status ='lex-type' ORDER BY types.typ""")
 results = c.fetchall()
 if results:
     print """
@@ -60,5 +60,5 @@ if results:
 print ""
 
-print ltdb.footer()
+print (ltdb.footer(par['ver']))
diff --git a/html/more.cgi b/html/more.cgi
index b684689..291d0ff 100755
--- a/html/more.cgi
+++ b/html/more.cgi
@@ -43,4 +43,4 @@ elif(lextyp):
 else:
     print("More examples of what?")
 print("")
-print ltdb.footer()
+print ltdb.footer(par['ver'])
diff --git a/html/rules.cgi b/html/rules.cgi
index 0659ebe..5fb33a6 100755
--- a/html/rules.cgi
+++ b/html/rules.cgi
@@ -26,7 +26,7 @@ con = sqlite3.connect(par['db'])
 c = con.cursor()
 c.execute("""SELECT types.typ, parents, lname, status, freq, arity, head
     FROM types left join typfreq on types.typ=typfreq.typ
-    WHERE status in ('rule', 'lrule', 'irule', 'root') order by
+    WHERE status in ('rule', 'lex-rule', 'inf-rule', 'root') order by
     types.typ""" )
 results = c.fetchall()
 if results:
@@ -72,5 +72,5 @@
 print ("")
 
-print (ltdb.footer())
+print (ltdb.footer(par['ver']))
diff --git a/html/search.cgi b/html/search.cgi
index 9ef5986..0377e54 100755
--- a/html/search.cgi
+++ b/html/search.cgi
@@ -108,4 +108,4 @@ elif(typ):
 
 """.format(typ, par['ver']))
 
-print ltdb.footer()
+print (ltdb.footer(par['ver']))
diff --git a/html/showtype.cgi b/html/showtype.cgi
index 5c0cb32..bef0169 100755
--- a/html/showtype.cgi
+++ b/html/showtype.cgi
@@ -26,9 +26,9 @@ maxexe = 3
 
 par=ltdb.getpar('params')
 
-print ltdb.header()
+print (ltdb.header())
 
-print ltdb.searchbar()
+print (ltdb.searchbar())
 
@@ -151,5 +151,5 @@
 else:
     print "Please give me a type (or rule or lexeme)"
 
-print ltdb.footer()
+print (ltdb.footer(par['ver']))
diff --git a/make-ltdb.bash b/make-ltdb.bash
index 7460b24..235cc23 100755
--- a/make-ltdb.bash
+++ b/make-ltdb.bash
@@ -7,6 +7,51 @@
 echo Welcome to the Linguistic Type Database
 echo
 
+
+###
+### get the grammar directory
+###
+
+while [ $# -gt 0 -a "${1#-}" != "$1" ]; do
+    case ${1} in
+	--script)
+	    lkbscript=${2};
+	    shift 2;
+	    ;;
+	--grmtdl)
+	    grammartdl=${2};
+	    shift 2;
+	    ;;
+	*)
+	    echo """You need to give a grammar directory or script file (or both)
+  --script path/to/lkb/script
+  --grmtdl path/to/grammar.tdl
+"""
+	    exit 0
+    esac
+done
+
+
+if [ ${lkbscript} ]
+then
+    echo "LKB script file is" ${lkbscript}
+    grammardir=`dirname ${lkbscript}`
+    grammardir=`dirname ${grammardir}`
+    echo "Grammar directory is " ${grammardir}
+elif [ ${grammartdl} ]
+then
+    echo "Grammar file is " ${grammartdl}
+    grammardir=`dirname ${grammartdl}`
+    echo "Grammar directory is " ${grammardir}
+else
+    echo """You need to give a grammar directory or script file
+  --script path/to/lkb/script
+  --grmtdl path/to/grammar.tdl
+"""
+    exit 0
+fi
+
+
 # If you want to use LKB_FOS you must set this variable
 # unset LKBFOS
 LKBFOS=~/delphin/lkb_fos/lkb.linux_x86_64
@@ -22,20 +67,6 @@
 else
 fi
 
-###
-### get the grammar directory
-###
-
-while [ $# -gt 0 -a "${1#-}" != "$1" ]; do
-    case ${1} in
-	--grmdir)
-	    grammardir=${2};
-	    shift 2;
-	    ;;
-    esac
-done
-
-echo "Grammar directory is " ${grammardir}
 
 ###
 ### set things up
@@ -110,17 +141,19 @@
 mkdir -p "${outdir}"
 
 db=${outdir}/${LTDB_FILE}
 
-### dump the lex-types
-echo "Dumping lex-type definitions and lexicon using the LKB (slow but steady)"
-
-
-unset DISPLAY;
-unset LUI;
-
+if [ ${lkbscript} ]
+then
+    ### dump the lex-types
+    echo "Dumping lex-type definitions and lexicon using the LKB (slow but steady)"
+
+
+    unset DISPLAY;
+    unset LUI;
+
 { cat 2>&1 <<- LISP
 	(format t "~%Read Grammar~%")
-	(lkb::read-script-file-aux "${grammardir}/lkb/script")
+	(lkb::read-script-file-aux "${lkbscript}")
 	(lkb::lkb-load-lisp "." "patch-lextypedb.lsp")
 	(format t "~%Output types~%")
 	(lkb::output-types :xml "${outdir}/${TYPES_FILE}")
@@ -135,20 +168,21 @@
 	LISP
 } | ${LISPCOMMAND} 2>${log} >${log}
 # } | cat
 
-###
-### Try to validate the types.xml
-###
-if which xmlstarlet &> /dev/null; then
-    xmlstarlet val -e ${outdir}/${TYPES_FILE}
-    xmlstarlet val -e ${outdir}/${RULES_FILE}
-    xmlstarlet val -e ${outdir}/${LRULES_FILE}
-    xmlstarlet val -e ${outdir}/${ROOTS_FILE}
-else
-    echo
-    echo " types files not validated, please install xmlstarlet."
-    echo "   sudo apt-get install xmlstarlet"
-    echo
+
+    ###
+    ### Try to validate the types.xml
+    ###
+    if which xmlstarlet &> /dev/null; then
+	xmlstarlet val -e ${outdir}/${TYPES_FILE}
+	xmlstarlet val -e ${outdir}/${RULES_FILE}
+	xmlstarlet val -e ${outdir}/${LRULES_FILE}
+	xmlstarlet val -e ${outdir}/${ROOTS_FILE}
+    else
+	echo
+	echo " types files not validated, please install xmlstarlet."
+	echo "   sudo apt-get install xmlstarlet"
+	echo
+    fi
 fi
 ###
 ### make the databases
 ###
 echo
 echo "Creating the databases ..."
 echo
 
-### create the db, write in the
+sqlite3 ${db} < tables.sql
 
-echo "Adding in the info from the lisp"
-echo
-python3 xml2db.py ${outdir} ${db}
+###
+if [ ${lkbscript} ]
+then
+    echo "Adding in the info from the lisp"
+    echo
+    python3 xml2db.py ${outdir} ${db}
+fi
 
-echo "Adding in the info from the tdl with pydelphin"
-echo
-python3 tdl2db.py ${grammardir} ${db} ### add tdl and comments
+if [ ${grammartdl} ]
+then
+    echo "Adding in the info from the tdl with pydelphin"
+    echo
+    python3 tdl2db.py ${grammartdl} ${db} ### add tdl and comments
+fi
+
+#echo "Adding in the info from the gold trees"
+#echo
 python3 gold2db.py ${grammardir} ${db}
diff --git a/makehome.py b/makehome.py
index 047ccc0..baf82d7 100644
--- a/makehome.py
+++ b/makehome.py
@@ -9,7 +9,9 @@
 import sys, os
 import datetime
 from collections import OrderedDict
-#import html/ltdb
+### get some local utilities
+sys.path.append(os.getcwd() + '/html')
+from ltdb import statuses, footer
 
 (script, version, grmdir) = sys.argv
 
@@ -18,6 +20,7 @@
 <title>{0} ltdb</title>
+
 
 <h1>Welcome to {0}</h1>
 
@@ -77,13 +80,30 @@
     print("<tr><td>{}</td><td>{}</td></tr>".format(a,v))
 print("</table>")
 
+###
+### Statuses
+###
+print("""
+<h3>Types and Instances in the Database</h3>
+""")
+print("<table>")
+for (typ, desc) in statuses.items():
+    print(f"<tr><td>{typ}</td><td>{desc}</td></tr>")
+print("</table>")
+
+###
+### Links to Logs
+###
 print("""
 <h3>Logs</h3>
+""")
+
+###
+### Links to ltdb
+###
+print("""
+<h3>Linguistic Type Database</h3>
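
For reference, a minimal sketch of querying the database this patch builds, once ``make-ltdb.bash`` has run. The table and column names follow the INSERT and SELECT statements above; the database path and the type name queried are assumptions for illustration (``${LTDB_FILE}`` is set elsewhere in the script):

    import sqlite3

    con = sqlite3.connect('ltdb.db')   # assumed value of ${outdir}/${LTDB_FILE}
    c = con.cursor()

    # lexical types, as listed by ltypes.cgi (note the new 'lex-type' status)
    for (typ,) in c.execute(
            "SELECT typ FROM types WHERE status = 'lex-type' ORDER BY typ"):
        print(typ)

    # where one type occurs in the gold trees: typind rows are
    # (typ, profile, sid, kara, made), i.e. a span from kara to made in sid
    for row in c.execute(
            "SELECT profile, sid, kara, made FROM typind WHERE typ = ?",
            ('n_pl_olr',)):            # made-up type name
        print(row)

    con.close()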