Skip to content

Commit b1dc9d7

Browse files
committed
start of dwc output
1 parent 49aeedb commit b1dc9d7

File tree

3 files changed

+102
-3
lines changed

3 files changed

+102
-3
lines changed

otc/taxonomy/taxonomy.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ void copy_file_two_tries(const fs::path & old_path, const fs::path & new_path) {
332332
}
333333
}
334334

335+
335336
void Taxonomy::write(const std::string& newdirname,
336337
bool copy_taxonomy_tsv_lines_raw,
337338
bool copy_synonyms_tsv_raw
@@ -341,11 +342,10 @@ void Taxonomy::write(const std::string& newdirname,
341342
if (! fs::exists(new_dir)) {
342343
fs::create_directories(new_dir);
343344
}
344-
345-
// Copy the other files.
345+
// Copy the other files.
346346
for(const auto& name: {"about.json", "conflicts.tsv", "deprecated.tsv",
347347
"log.tsv", "otu_differences.tsv", "weaklog.csv"}) {
348-
copy_file_two_tries(old_dir/name,new_dir/name);
348+
copy_file_two_tries(old_dir/name, new_dir/name);
349349
}
350350

351351
const auto fname = "synonyms.tsv";

otc/taxonomy/taxonomy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ class Taxonomy: public std::vector<TaxonomyRecord>, public BaseTaxonomy {
262262
private:
263263
unsigned int read_input_taxonomy_stream(std::istream & taxonomy_stream);
264264
unsigned int read_ott_taxonomy_stream(std::istream & taxonomy_stream);
265+
265266
friend class RichTaxonomy;
266267
};
267268

tools/taxonomy-parser.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include <boost/program_options.hpp>
88
#include <bitset>
99
#include <regex>
10+
#include <filesystem>
11+
#include <ostream>
1012

1113
#include "otc/error.h"
1214
#include "otc/tree.h"
@@ -17,6 +19,7 @@
1719
#include "otc/config_file.h"
1820

1921
using namespace otc;
22+
namespace fs = std::filesystem;
2023

2124
using std::string;
2225
using std::vector;
@@ -72,6 +75,7 @@ variables_map parse_cmd_line(int argc,char* argv[]) {
7275
("high-degree-nodes",value<int>(),"Show the top <arg> high-degree nodes")
7376
("write-tree,T","Write out the result as a tree")
7477
("write-taxonomy",value<string>(),"Write out the result as a taxonomy to directory 'arg'")
78+
("write-dwca",value<string>(),"Write out the result as a taxonomy to directory 'arg' as a Darwin Core Archive")
7579
("name,N", value<OttId>(), "Return name of the given ID")
7680
("uniqname,U", value<OttId>(), "Return unique name for the given ID")
7781
("report-lost-taxa",value<string>(), "A tree to report missing taxa for")
@@ -283,6 +287,93 @@ std::function<bool(tax_flags)> get_flags_match(variables_map& args) {
283287
}
284288
}
285289

290+
// Writes the EML metadata document (eml.xml) of a Darwin Core Archive.
//
// The document is a fixed template describing the "Open Tree of Life
// Taxonomy" dataset; nothing in it depends on the taxonomy being exported.
//
// @param ofn  path of the file to create or overwrite.
//
// NOTE(review): a failure to open `ofn` is not reported; the stream simply
// stays in a failed state and nothing is written.
void write_eml_xml(const std::string & ofn) {
    // The root element is split one attribute per literal so that every
    // attribute value can be checked for balanced quotes at a glance.
    // Each attribute value must be a single quoted token, otherwise the
    // document is not well-formed XML.
    const std::string content = "<?xml version=\"1.0\"?>\n"
        "<eml:eml xmlns:eml=\"eml://ecoinformatics.org/eml-2.1.1\""
        " xmlns:md=\"eml://ecoinformatics.org/methods-2.1.1\""
        " xmlns:proj=\"eml://ecoinformatics.org/project-2.1.1\""
        " xmlns:d=\"eml://ecoinformatics.org/dataset-2.1.1\""
        " xmlns:res=\"eml://ecoinformatics.org/resource-2.1.1\""
        " xmlns:dc=\"http://purl.org/dc/terms/\""
        " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
        " packageId=\"/2020-5-30::0:53:12\""
        " system=\"http://globalnames.org\""
        " xml:lang=\"en\""
        " xsi:schemaLocation=\"eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd\">\n"
        " <dataset id=\"\">\n"
        " <title>Open Tree of Life Taxonomy</title>\n"
        " <license/>\n"
        " <metadataProvider>\n"
        " <individualName>\n"
        " <givenName>Mark</givenName>\n"
        " <surName>Holder</surName>\n"
        " </individualName>\n"
        " <electronicMailAddress>mtholder@gmail.com</electronicMailAddress>\n"
        " </metadataProvider>\n"
        " <pubDate></pubDate>\n"
        " <abstract>\n"
        " <para></para>\n"
        " </abstract>\n"
        " <contact>\n"
        " <references>1</references>\n"
        " </contact>\n"
        " </dataset>\n"
        " <additionalMetadata>\n"
        " <metadata>\n"
        " <citation/>\n"
        " </metadata>\n"
        " </additionalMetadata>\n"
        "</eml:eml>\n";
    // The stream is flushed and closed by its destructor.
    std::ofstream tf(ofn);
    tf << content;
}
321+
322+
// Emits the archive descriptor (meta.xml) of a Darwin Core Archive to `ofn`.
//
// The descriptor is a fixed template: it names taxa.txt as the core file,
// declares one ignored header line, and maps column 0 to the record id and
// columns 1-8 to Darwin Core terms (scientificName ... nomenclaturalCode).
void write_meta_xml(const std::string ofn) {
    // One entry per output line; each entry carries its own trailing newline.
    static const char * const descriptor_lines[] = {
        "<?xml version=\"1.0\"?>\n",
        "<archive xmlns=\"http://rs.tdwg.org/dwc/text/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd\">\n",
        " <core encoding=\"UTF-8\" fieldsTerminatedBy=\",\" fieldsEnclosedBy=\"&quot;\" linesTerminatedBy=\"&#10;\" rowType=\"http://rs.tdwg.org/dwc/terms/Taxon\" ignoreHeaderLines=\"1\">\n",
        " <files>\n",
        " <location>taxa.txt</location>\n",
        " </files>\n",
        " <id index=\"0\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/scientificName\" index=\"1\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/kingdom\" index=\"2\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/phylum\" index=\"3\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/class\" index=\"4\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/order\" index=\"5\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/family\" index=\"6\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/genus\" index=\"7\"/>\n",
        " <field term=\"http://rs.tdwg.org/dwc/terms/nomenclaturalCode\" index=\"8\"/>\n",
        " </core>\n",
        "</archive>\n"
    };

    std::ofstream descriptor(ofn);
    for (const char * text : descriptor_lines) {
        descriptor << text;
    }
    descriptor.close();
}
344+
345+
void write_taxa_txt(const std::string ofn,
346+
const Taxonomy & taxonomy,
347+
const Tree_t & the_tree ){
348+
// <id index="0"/>
349+
// <field term="http://rs.tdwg.org/dwc/terms/scientificName" index="1"/>
350+
// <field term="http://rs.tdwg.org/dwc/terms/kingdom" index="2"/>
351+
// <field term="http://rs.tdwg.org/dwc/terms/phylum" index="3"/>
352+
// <field term="http://rs.tdwg.org/dwc/terms/class" index="4"/>
353+
// <field term="http://rs.tdwg.org/dwc/terms/order" index="5"/>
354+
// <field term="http://rs.tdwg.org/dwc/terms/family" index="6"/>
355+
// <field term="http://rs.tdwg.org/dwc/terms/genus" index="7"/>
356+
// <field term="http://rs.tdwg.org/dwc/terms/nomenclaturalCode" index="8"/>
357+
std::ofstream tf (ofn);
358+
for (auto nd : iter_pre_const(the_tree)) {
359+
if (nd->has_ott_id()) {
360+
cout << nd->get_ott_id() << "\n";
361+
}
362+
}
363+
tf.close();
364+
}
365+
366+
void write_taxonomy_as_dwca(const std::string & dir,
367+
const Taxonomy & taxonomy,
368+
const Tree_t & the_tree ) {
369+
fs::path new_dir = dir;
370+
if (! fs::exists(new_dir)) {
371+
fs::create_directories(new_dir);
372+
}
373+
write_eml_xml((new_dir/"eml.xml").string());
374+
write_meta_xml((new_dir/"meta.xml").string());
375+
write_taxa_txt((new_dir/"taxa.txt").string(), taxonomy, the_tree);
376+
}
286377

287378
int main(int argc, char* argv[])
288379
{
@@ -376,6 +467,13 @@ int main(int argc, char* argv[])
376467
if (args.count("write-taxonomy")) {
377468
taxonomy.write(args["write-taxonomy"].as<string>(), false, !root_changed);
378469
}
470+
if (args.count("write-dwca")) {
471+
//taxonID,scientificName,kingdom,phylum,class,order,family,genus,nomenclaturalCode
472+
auto nodeNamer = [](const auto& record){return string(record.name)+"_ott"+std::to_string(record.id);};
473+
const auto the_tree_ptr = taxonomy.get_tree<Tree_t>(nodeNamer);
474+
const Tree_t & the_tree = *the_tree_ptr;
475+
write_taxonomy_as_dwca(args["write-dwca"].as<string>(), taxonomy, the_tree );
476+
}
379477
if (args.count("name")) {
380478
OttId id = args["name"].as<OttId>();
381479
std::cout << taxonomy.record_from_id(id).name << std::endl;

0 commit comments

Comments
 (0)