Skip to content

Commit e525eec

Browse files
committed
Add import cleanup
1 parent 05ae5c4 commit e525eec

File tree

6 files changed

+142
-41
lines changed

6 files changed

+142
-41
lines changed

importer/src/hierarchy.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,27 @@ void Hierarchy::add_linked_item(std::shared_ptr<HierarchyItem> &item)
5555
tolink->second->add_linked(item);
5656
}
5757

58+
void Hierarchy::cleanup()
59+
{
60+
for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter)
61+
{
62+
std::set<std::shared_ptr<HierarchyItem> > keep;
63+
for (auto item : root_iter->second)
64+
{
65+
item->cleanup_children();
66+
if (item->keep())
67+
keep.insert(item);
68+
else
69+
keep.insert(item->children().begin(), item->children().end());
70+
}
71+
root_iter->second = keep;
72+
73+
// ensure that the parent is set correctly
74+
for (auto item : root_iter->second)
75+
item->set_parent(root_iter->first, true);
76+
}
77+
}
78+
5879
void Hierarchy::set_country(const std::string &country, hindex id)
5980
{
6081
if (!m_items.count(id))
@@ -85,6 +106,9 @@ void Hierarchy::finalize()
85106
index = item->index(index, 0);
86107
item->set_parent(0);
87108
}
109+
110+
std::cout << "Hierarchy: active items: " << index
111+
<< " / cleared items: " << m_items.size() - index << "\n";
88112
}
89113

90114
void Hierarchy::write(sqlite3pp::database &db) const

importer/src/hierarchy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class Hierarchy
2121
void add_item(std::shared_ptr<HierarchyItem> &item);
2222
void add_linked_item(std::shared_ptr<HierarchyItem> &item);
2323
void set_country(const std::string &country, hindex id);
24+
void cleanup();
2425
void finalize();
2526
void write(sqlite3pp::database &db) const;
2627

importer/src/hierarchyitem.cpp

Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,100 @@
44
#include <iostream>
55
#include <stdexcept>
66

7+
// Types that keep() retains even when the item has no name.
// Defined empty here; nothing in this file adds entries.
std::set<std::string> HierarchyItem::s_priority_types;
8+
// Types that keep() always rejects, regardless of name.
// Defined empty here; nothing in this file adds entries.
std::set<std::string> HierarchyItem::s_skip_types;
9+
10+
// Characters a type string may consist of; keep() rejects any type that
// contains a character outside this set. Read-only, hence const.
static const std::string allowed_type_chars = "abcdefghijklmnopqrstuvwxyz_-";
11+
712
/// Builds an item from one result row.
///
/// Reads the columns place_id, linked_place_id, parent_place_id, country_code,
/// class, type, housenumber, postcode, latitude, longitude, name and extra.
/// The value passed to as<>() is used as the fallback for each column.
/// The display name and its extra text are derived once here and cached in
/// m_name / m_name_extra.
HierarchyItem::HierarchyItem(const pqxx::row &row)
{
  // identifiers
  m_id = row["place_id"].as<hindex>(0);
  m_linked_id = row["linked_place_id"].as<hindex>(0);
  m_parent_id = row["parent_place_id"].as<hindex>(0);

  // class and type are folded into a single type string
  const std::string cls = row["class"].as<std::string>("");
  const std::string value = row["type"].as<std::string>("");
  m_type = geocoder_type(cls, value);

  // plain attributes
  m_country = row["country_code"].as<std::string>("");
  m_housenumber = row["housenumber"].as<std::string>("");
  m_postcode = row["postcode"].as<std::string>("");
  m_latitude = row["latitude"].as<float>(0);
  m_longitude = row["longitude"].as<float>(0);

  // raw key/value data
  m_data_name = parse_to_map(row["name"].as<std::string>(""));
  m_data_extra = parse_to_map(row["extra"].as<std::string>(""));

  // A house number, when present, becomes the primary name and the plain
  // name moves into the extra slot.
  const std::string plain_name = get_with_def(m_data_name, "name");
  if (m_housenumber.empty())
  {
    m_name = plain_name;
    m_name_extra.clear();
  }
  else
  {
    m_name = m_housenumber;
    m_name_extra = plain_name;
  }

  // An empty extra slot falls back to the brand.
  if (m_name_extra.empty())
    m_name_extra = get_with_def(m_data_extra, "brand");
}
38+
39+
bool HierarchyItem::keep() const
40+
{
41+
if (m_type.find_first_not_of(allowed_type_chars) != std::string::npos)
42+
{
43+
std::cout << "Dropping " << m_type << "\n";
44+
return false;
45+
}
46+
if (s_skip_types.count(m_type) > 0)
47+
return false;
48+
return !m_name.empty() || s_priority_types.count(m_type) > 0;
2249
}
2350

2451
// Appends `child` to this item's children and records this item as its parent.
void HierarchyItem::add_child(std::shared_ptr<HierarchyItem> child)
2552
{
2653
m_children.push_back(child);
54+
// Non-forced set_parent: throws std::runtime_error when the child already
// has a different non-zero parent and m_id is non-zero. The child has
// already been appended to m_children at that point.
child->set_parent(m_id);
2755
}
2856

2957
void HierarchyItem::add_linked(std::shared_ptr<HierarchyItem> linked)
3058
{
31-
m_name.insert(linked->m_name.begin(), linked->m_name.end());
32-
m_extra.insert(linked->m_extra.begin(), linked->m_extra.end());
33-
// if (m_class != linked->m_class || m_type != linked->m_type)
34-
// std::cout << "Mismatch between linked objects: " << m_id << ", " << m_class << ", " <<
35-
// m_type
36-
// << " -> " << linked->m_id << ", " << linked->m_class << ", " << linked->m_type
37-
// << "\n";
59+
m_data_name.insert(linked->m_data_name.begin(), linked->m_data_name.end());
60+
m_data_extra.insert(linked->m_data_extra.begin(), linked->m_data_extra.end());
3861
}
3962

40-
void HierarchyItem::set_parent(hindex parent)
63+
void HierarchyItem::set_parent(hindex parent, bool force)
4164
{
42-
if (m_parent_id != parent && m_parent_id != 0)
65+
if (!force && m_parent_id != parent && m_parent_id != 0 && parent != 0)
4366
{
4467
std::cout << "New parent (" << parent << ") for " << m_id << " does not match old one ("
4568
<< m_parent_id << ")\n";
4669
throw std::runtime_error("Mismatch between new and old parent");
4770
}
4871
m_parent_id = parent;
49-
for (auto c : m_children)
50-
c->set_parent(m_id);
72+
// for (auto c : m_children)
73+
// c->set_parent(m_id, force);
74+
}
75+
76+
void HierarchyItem::cleanup_children()
77+
{
78+
// as a result of this run, children that are supposed to be kept are staying in children
79+
// property. all disposed ones are still pointed to via Hierarchy map, but should not be accessed
80+
// while moving along hierarchy for indexing or writing it
81+
std::deque<std::shared_ptr<HierarchyItem> > children;
82+
for (auto item : m_children)
83+
{
84+
item->cleanup_children();
85+
if (item->keep())
86+
children.push_back(item);
87+
else
88+
children.insert(children.end(), item->m_children.begin(), item->m_children.end());
89+
}
90+
m_children = children;
91+
92+
// set parent, forced
93+
for (auto item : m_children)
94+
item->set_parent(m_id, true);
5195
}
5296

5397
sqlid HierarchyItem::index(sqlid idx, sqlid parent)
5498
{
99+
if (!keep())
100+
throw std::runtime_error("Trying to index a location that was not supposed to be kept");
55101
m_my_index = idx;
56102
m_parent_index = parent;
57103
++idx;
@@ -63,27 +109,20 @@ sqlid HierarchyItem::index(sqlid idx, sqlid parent)
63109

64110
void HierarchyItem::write(sqlite3pp::database &db) const
65111
{
66-
// primary data
67-
std::string name = get_with_def(m_name, "name");
68-
std::string name_en = get_with_def(m_name, "name:en");
69-
std::string phone = get_with_def(m_extra, "phone");
70-
std::string website = get_with_def(m_extra, "website");
112+
if (!keep())
113+
throw std::runtime_error("Trying to write a location that was not supposed to be kept");
71114

72-
std::string name_extra;
73-
if (!m_housenumber.empty())
74-
{
75-
name_extra = name;
76-
name = m_housenumber;
77-
}
115+
// primary data
116+
std::string name_en = get_with_def(m_data_name, "name:en");
117+
std::string phone = get_with_def(m_data_extra, "phone");
118+
std::string website = get_with_def(m_data_extra, "website");
78119

79-
if (name_extra.empty())
80-
name_extra = get_with_def(m_extra, "brand");
81120
{
82121
sqlite3pp::command cmd(db, "INSERT INTO object_primary_tmp (id, postgres_id, name, name_extra, "
83122
"name_en, phone, postal_code, website, parent, longitude, "
84123
"latitude) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
85-
cmd.binder() << m_my_index << (int)m_id << name << name_extra << name_en << phone << m_postcode
86-
<< website << m_parent_index << m_longitude << m_latitude;
124+
cmd.binder() << m_my_index << (int)m_id << m_name << m_name_extra << name_en << phone
125+
<< m_postcode << website << m_parent_index << m_longitude << m_latitude;
87126
if (cmd.execute() != SQLITE_OK)
88127
std::cerr << "WriteSQL : error inserting primary data for " << m_id << ", " << m_my_index
89128
<< "\n";
@@ -95,7 +134,7 @@ void HierarchyItem::write(sqlite3pp::database &db) const
95134
//= "INSERT INTO object_type_tmp (prim_id, type) VALUES (?, \"" + type + "\")";
96135
= "INSERT INTO object_type_tmp (prim_id, type) VALUES (?, ?)";
97136
sqlite3pp::command cmd(db, command.c_str());
98-
cmd.binder() << m_my_index << geocoder_type(m_class, m_type);
137+
cmd.binder() << m_my_index << m_type;
99138
if (cmd.execute() != SQLITE_OK)
100139
std::cerr << "WriteSQL: error inserting type for " << m_id << ", " << m_my_index << "\n";
101140
}
@@ -120,7 +159,7 @@ void HierarchyItem::print_branch(unsigned int offset) const
120159
std::cout << std::string(offset, ' ') << "- " << m_id << " ";
121160
if (!m_housenumber.empty())
122161
std::cout << "house " << m_housenumber << " ";
123-
for (const auto &i : m_name)
162+
for (const auto &i : m_data_name)
124163
std::cout << i.first << ": " << i.second << " ";
125164
std::cout << "(" << m_my_index << " " << m_last_child_index - m_my_index << ": " << m_parent_id
126165
<< ", " << m_country << ")\n";

importer/src/hierarchyitem.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <deque>
77
#include <memory>
88
#include <pqxx/pqxx>
9+
#include <set>
910
#include <sqlite3pp.h>
1011
#include <string>
1112

@@ -19,10 +20,14 @@ class HierarchyItem
1920
hindex linked_id() const { return m_linked_id; }
2021
hindex parent_id() const { return m_parent_id; }
2122
const std::string &country() const { return m_country; }
23+
bool keep() const;
24+
25+
const std::deque<std::shared_ptr<HierarchyItem> > &children() { return m_children; }
2226

2327
void add_child(std::shared_ptr<HierarchyItem> child);
2428
void add_linked(std::shared_ptr<HierarchyItem> linked);
25-
void set_parent(hindex);
29+
void set_parent(hindex parent, bool force = false);
30+
void cleanup_children();
2631
sqlid index(sqlid idx, sqlid parent);
2732
void write(sqlite3pp::database &db) const;
2833

@@ -36,18 +41,22 @@ class HierarchyItem
3641
sqlid m_parent_index;
3742
sqlid m_last_child_index;
3843

39-
std::string m_class;
4044
std::string m_type;
4145
float m_latitude;
4246
float m_longitude;
4347
std::string m_country;
4448
std::string m_postcode;
4549
std::string m_housenumber;
50+
std::string m_name;
51+
std::string m_name_extra;
4652

47-
std::map<std::string, std::string> m_name;
48-
std::map<std::string, std::string> m_extra;
53+
std::map<std::string, std::string> m_data_name;
54+
std::map<std::string, std::string> m_data_extra;
4955

5056
std::deque<std::shared_ptr<HierarchyItem> > m_children;
57+
58+
static std::set<std::string> s_priority_types;
59+
static std::set<std::string> s_skip_types;
5160
};
5261

5362
#endif

importer/src/main.cpp

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,23 +143,35 @@ int main(int argc, char *argv[])
143143
}
144144
}
145145

146+
// find missing parents for root nodes
146147
std::cout << "Fill missing hierarchies. Root size: " << hierarchy.get_root_count() << "\n";
147148
for (hindex parent = hierarchy.get_next_nonzero_root_parent(); parent;)
148149
{
149-
pqxx::result r = txn.exec_params(base_query + "where place_id=$1", parent);
150+
pqxx::result r = txn.exec_params(base_query + "where place_id=$1", parent);
151+
bool found = false;
150152
for (auto row : r)
151153
{
152154
std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
153155
hierarchy.add_item(item);
156+
found = true;
157+
}
158+
159+
if (!found)
160+
{
161+
std::cerr << "Missing parent with ID " << parent << ". Stopping import\n";
162+
return -1;
154163
}
155164

156165
parent = hierarchy.get_next_nonzero_root_parent();
157166
}
158167

159-
std::cout << "Try to fill missing parents through countries. Root size: "
160-
<< hierarchy.get_root_count() << "\n";
168+
// remove all items from hierarchy that are not supposed to be there
169+
std::cout << "Cleanup hierarchy\n";
170+
hierarchy.cleanup();
161171

162172
// find missing countries and move root nodes under them if possible
173+
std::cout << "Try to fill missing parents through countries. Root size: "
174+
<< hierarchy.get_root_count() << "\n";
163175
for (std::string country : hierarchy.get_root_countries())
164176
{
165177
for (auto row : txn.exec_params(
@@ -171,7 +183,7 @@ int main(int argc, char *argv[])
171183

172184
txn.commit(); // finalize postgres transactions
173185

174-
hierarchy.print(false);
186+
// hierarchy.print(false);
175187

176188
// Saving data into SQLite
177189
sqlite3pp::database db(GeoNLP::Geocoder::name_primary(database_path).c_str());
@@ -261,6 +273,22 @@ int main(int argc, char *argv[])
261273
"SELECT box_id, min(latitude), max(latitude), min(longitude), max(longitude) from "
262274
"object_primary group by box_id");
263275

276+
// Stats view
277+
db.execute("DROP VIEW IF EXISTS type_stats");
278+
db.execute(
279+
"CREATE VIEW type_stats AS SELECT t.name as type_name, COUNT(*) AS cnt FROM object_primary o "
280+
"JOIN \"type\" t ON t.id = o.type_id GROUP BY t.name ORDER BY cnt desc");
281+
{
282+
std::cout << "List of most popular imported types\n";
283+
sqlite3pp::query qry(db, "SELECT type_name, cnt FROM type_stats ORDER BY cnt DESC LIMIT 25");
284+
for (auto v : qry)
285+
{
286+
std::string name;
287+
int cnt;
288+
v.getter() >> name >> cnt;
289+
std::cout << " " << name << "\t" << cnt << "\n";
290+
}
291+
}
264292
// Recording version
265293
db.execute("DROP TABLE IF EXISTS meta");
266294
db.execute("CREATE TABLE meta (key TEXT, value TEXT)");

importer/src/utils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ std::map<std::string, std::string> parse_to_map(const std::string &js)
2727

2828
/// Combines a class/value pair into a single type string.
///
/// @param t_class  the class part
/// @param t_value  the value part
/// @return `t_class` alone when `t_value` is empty or the generic "yes",
///         otherwise `t_class + "_" + t_value`
std::string geocoder_type(const std::string &t_class, const std::string &t_value)
{
  // An empty or "yes" value adds no information beyond the class.
  if (t_value.empty() || t_value == "yes")
    return t_class;
  return t_class + "_" + t_value;
}

0 commit comments

Comments
 (0)