From f052eb3859fddab0aac95d5e30a2939ee5ebc4a6 Mon Sep 17 00:00:00 2001
From: James Mertens
Date: Wed, 22 Mar 2017 18:58:17 -0400
Subject: [PATCH 1/4] Create parse_wca.py

---
 datasets/wca/parse_wca.py | 63 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 datasets/wca/parse_wca.py

NOTE(review): this series was rebuilt from a copy whose line breaks and
indentation had been collapsed. Hunk line counts were re-checked against
every hunk header and diffstat; the 4-space indent width and intra-line
spacing are assumptions - confirm against the repository before relying on
them. Author e-mail addresses are absent from this copy and must be restored
before using `git am`.

diff --git a/datasets/wca/parse_wca.py b/datasets/wca/parse_wca.py
new file mode 100644
index 0000000..25f408a
--- /dev/null
+++ b/datasets/wca/parse_wca.py
@@ -0,0 +1,63 @@
+
+import csv
+import urllib
+import zipfile
+import numpy
+
+urllib.urlretrieve('https://www.worldcubeassociation.org/results/misc/WCA_export.tsv.zip', 'WCA_export.tsv.zip')
+zip_ref = zipfile.ZipFile('WCA_export.tsv.zip', 'r')
+zip_ref.extract('WCA_export_Results.tsv')
+zip_ref.close()
+
+with open('WCA_export_Results.tsv', 'r') as f:
+    reader = csv.reader(f, delimiter="\t")
+    data = list(reader)
+
+data = data[1:] # First line is a header
+
+# dicts
+Competitions = dict()
+Events = dict()
+Rounds = dict()
+Persons = dict()
+
+# Tensor
+Results = []
+
+def getOrSetDictVal( _key, _dict ):
+    if _key in _dict :
+        return _dict[_key]
+    else:
+        _dict[_key] = len(_dict) + 1
+        return _dict[_key]
+
+for entry in data :
+    _competition = getOrSetDictVal(entry[0], Competitions)
+    _event = getOrSetDictVal(entry[1], Events)
+    _round = getOrSetDictVal(entry[2], Rounds)
+    _person = getOrSetDictVal(entry[7], Persons)
+
+    if int(entry[5]) > 0 : # use average if average > 0
+        _time = int(entry[5])
+    elif int(entry[4]) > 0 : # use best if best > 0
+        _time = int(entry[4])
+    else :
+        _time = 0
+
+    if(_time > 0) :
+        _result = (_competition, _event, _round, _person, _time);
+        Results.append(_result)
+
+# Save tensor file
+numpy.savetxt("WCA_Results.tns", Results, delimiter=" ", fmt='%i')
+
+def writeMap(_dict, _file) :
+    with open(_file, 'w') as f:
+        for _key in sorted(_dict, key=_dict.get) :
+            f.write('%s\n' % (_key))
+
+
+writeMap(Competitions, "mode-1-competitions.map")
+writeMap(Events, "mode-2-events.map")
+writeMap(Rounds, "mode-3-rounds.map")
+writeMap(Persons, "mode-4-persons.map")

From 7f2df4ae01fafd4c7b57b0af739da3c768a4cd23 Mon Sep 17 00:00:00 2001
From: Shaden Smith
Date: Thu, 23 Mar 2017 10:47:06 -0500
Subject: [PATCH 2/4] Specify python2 and ensure that only valid competitors get IDs.

---
 datasets/wca/parse_wca.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 datasets/wca/parse_wca.py

diff --git a/datasets/wca/parse_wca.py b/datasets/wca/parse_wca.py
old mode 100644
new mode 100755
index 25f408a..4a828e4
--- a/datasets/wca/parse_wca.py
+++ b/datasets/wca/parse_wca.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python2
 
 import csv
 import urllib
@@ -32,19 +33,21 @@ def getOrSetDictVal( _key, _dict ):
         return _dict[_key]
 
 for entry in data :
-    _competition = getOrSetDictVal(entry[0], Competitions)
-    _event = getOrSetDictVal(entry[1], Events)
-    _round = getOrSetDictVal(entry[2], Rounds)
-    _person = getOrSetDictVal(entry[7], Persons)
+    _time = 0
 
     if int(entry[5]) > 0 : # use average if average > 0
         _time = int(entry[5])
     elif int(entry[4]) > 0 : # use best if best > 0
         _time = int(entry[4])
-    else :
-        _time = 0
 
-    if(_time > 0) :
+    if _time > 0:
+        # A few WCA IDs have no times associated with them, so make sure that
+        # they are not added to the tensor.
+        _competition = getOrSetDictVal(entry[0], Competitions)
+        _event = getOrSetDictVal(entry[1], Events)
+        _round = getOrSetDictVal(entry[2], Rounds)
+        _person = getOrSetDictVal(entry[7], Persons)
+
         _result = (_competition, _event, _round, _person, _time);
         Results.append(_result)
 
@@ -56,8 +59,9 @@ def writeMap(_dict, _file) :
         for _key in sorted(_dict, key=_dict.get) :
             f.write('%s\n' % (_key))
 
+writeMap(Competitions, 'mode-1-competitions.map')
+writeMap(Events, 'mode-2-events.map')
+writeMap(Rounds, 'mode-3-rounds.map')
+writeMap(Persons, 'mode-4-persons.map')
+
 
-writeMap(Competitions, "mode-1-competitions.map")
-writeMap(Events, "mode-2-events.map")
-writeMap(Rounds, "mode-3-rounds.map")
-writeMap(Persons, "mode-4-persons.map")

From f2399133ca06328964e808a344903ec702fb719c Mon Sep 17 00:00:00 2001
From: Shaden Smith
Date: Thu, 23 Mar 2017 12:01:31 -0500
Subject: [PATCH 3/4] competition IDs are now sorted temporally

---
 datasets/wca/parse_wca.py | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/datasets/wca/parse_wca.py b/datasets/wca/parse_wca.py
index 4a828e4..b0e9a2f 100755
--- a/datasets/wca/parse_wca.py
+++ b/datasets/wca/parse_wca.py
@@ -1,20 +1,23 @@
 #!/usr/bin/env python2
 
+
+#
+# NOTE: Due to some competitions without results, this script will still
+# generate a tensor with some gaps in the competition mode.
+#
+
 import csv
 import urllib
 import zipfile
+import datetime
 import numpy
 
 urllib.urlretrieve('https://www.worldcubeassociation.org/results/misc/WCA_export.tsv.zip', 'WCA_export.tsv.zip')
 zip_ref = zipfile.ZipFile('WCA_export.tsv.zip', 'r')
 zip_ref.extract('WCA_export_Results.tsv')
+zip_ref.extract('WCA_export_Competitions.tsv')
 zip_ref.close()
 
-with open('WCA_export_Results.tsv', 'r') as f:
-    reader = csv.reader(f, delimiter="\t")
-    data = list(reader)
-
-data = data[1:] # First line is a header
 
 # dicts
 Competitions = dict()
@@ -22,6 +25,30 @@
 Rounds = dict()
 Persons = dict()
 
+
+# Parse competition dates and assign contiguous IDs
+Competition_dates = dict()
+with open('WCA_export_Competitions.tsv', 'r') as f:
+    reader = csv.reader(f, delimiter="\t")
+    reader.next() # skip header
+    for line in reader:
+        comp_id = line[0]
+        year = int(line[5])
+        month = int(line[6])
+        day = int(line[7])
+        Competition_dates[comp_id] = datetime.date(year, month, day)
+
+# Assign contiguous competition IDs, sorted by date
+for comp in sorted(Competition_dates, key=Competition_dates.get) :
+    Competitions[comp] = len(Competitions) + 1
+
+# Read solve data
+with open('WCA_export_Results.tsv', 'r') as f:
+    reader = csv.reader(f, delimiter="\t")
+    data = list(reader)
+
+data = data[1:] # First line is a header
+
 # Tensor
 Results = []
 

From 25fa3f9d6e5ce947e97c218aab44f1b2ac0fc7a6 Mon Sep 17 00:00:00 2001
From: Shaden Smith
Date: Thu, 23 Mar 2017 14:11:10 -0500
Subject: [PATCH 4/4] trying out just 333 modeling

---
 datasets/wca/parse_wca.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/datasets/wca/parse_wca.py b/datasets/wca/parse_wca.py
index b0e9a2f..7810214 100755
--- a/datasets/wca/parse_wca.py
+++ b/datasets/wca/parse_wca.py
@@ -67,7 +67,18 @@ def getOrSetDictVal( _key, _dict ):
     elif int(entry[4]) > 0 : # use best if best > 0
         _time = int(entry[4])
 
+    # FMC and MBLD have different result formats
+    if (entry[1] == '333fm') or (entry[1] == '333mbf'):
+        continue
+
+    # XXX
+    # only do 333 for now
+    if entry[1] != '333':
+        continue
+
     if _time > 0:
+        _time = float(_time) / 100. # convert back to seconds
+
         # A few WCA IDs have no times associated with them, so make sure that
         # they are not added to the tensor.
         _competition = getOrSetDictVal(entry[0], Competitions)
@@ -75,11 +86,13 @@ def getOrSetDictVal( _key, _dict ):
         _round = getOrSetDictVal(entry[2], Rounds)
         _person = getOrSetDictVal(entry[7], Persons)
 
-        _result = (_competition, _event, _round, _person, _time);
+        #_result = (_competition, _event, _round, _person, _time);
+        _result = (_competition, _round, _person, _time);
         Results.append(_result)
 
 # Save tensor file
-numpy.savetxt("WCA_Results.tns", Results, delimiter=" ", fmt='%i')
+#numpy.savetxt("WCA_Results.tns", Results, fmt='%u %u %u %u %0.2f')
+numpy.savetxt("WCA_Results.tns", Results, fmt='%u %u %u %0.2f')
 
 def writeMap(_dict, _file) :
     with open(_file, 'w') as f: