Skip to content

Commit 503a0b1

Browse files
committed
drop duplicates from the database
1 parent 0ec5c18 commit 503a0b1

File tree

4 files changed

+75
-12
lines changed

4 files changed

+75
-12
lines changed

importer/src/hierarchyitem.cpp

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,7 @@ HierarchyItem::HierarchyItem(const pqxx::row &row)
2525
m_data_name = parse_to_map(row["name"].as<std::string>(""));
2626
m_data_extra = parse_to_map(row["extra"].as<std::string>(""));
2727

28-
m_name = get_with_def(m_data_name, "name");
29-
m_name_extra.clear();
30-
if (!m_housenumber.empty())
31-
{
32-
m_name_extra = m_name;
33-
m_name = m_housenumber;
34-
}
35-
36-
if (m_name_extra.empty())
37-
m_name_extra = get_with_def(m_data_extra, "brand");
28+
set_names();
3829
}
3930

4031
// trim from start (in place)
@@ -103,6 +94,21 @@ bool HierarchyItem::keep() const
10394
return !m_name.empty() || s_priority_types.count(m_type) > 0;
10495
}
10596

97+
bool HierarchyItem::is_duplicate(std::shared_ptr<HierarchyItem> item) const
98+
{
99+
if (s_priority_types.count(m_type) > 0)
100+
return false;
101+
102+
if (m_name != item->m_name || m_country != item->m_country || m_postcode != item->m_postcode)
103+
return false;
104+
105+
if (m_type == item->m_type || same_starts_with("building", m_type, item->m_type)
106+
|| same_starts_with("highway", m_type, item->m_type))
107+
return true;
108+
109+
return false;
110+
}
111+
106112
void HierarchyItem::add_child(std::shared_ptr<HierarchyItem> child)
107113
{
108114
m_children.push_back(child);
@@ -113,6 +119,21 @@ void HierarchyItem::add_linked(std::shared_ptr<HierarchyItem> linked)
113119
{
114120
m_data_name.insert(linked->m_data_name.begin(), linked->m_data_name.end());
115121
m_data_extra.insert(linked->m_data_extra.begin(), linked->m_data_extra.end());
122+
set_names();
123+
}
124+
125+
void HierarchyItem::set_names()
126+
{
127+
m_name = get_with_def(m_data_name, "name");
128+
m_name_extra.clear();
129+
if (!m_housenumber.empty())
130+
{
131+
m_name_extra = m_name;
132+
m_name = m_housenumber;
133+
}
134+
135+
if (m_name_extra.empty())
136+
m_name_extra = get_with_def(m_data_extra, "brand");
116137
}
117138

118139
void HierarchyItem::set_parent(hindex parent, bool force)
@@ -144,6 +165,34 @@ void HierarchyItem::cleanup_children()
144165
}
145166
m_children = children;
146167

168+
// check for duplicates
169+
for (size_t child_index = 0; child_index < m_children.size(); ++child_index)
170+
{
171+
std::shared_ptr<HierarchyItem> item = m_children[child_index];
172+
std::deque<std::shared_ptr<HierarchyItem> > duplicates;
173+
174+
children.clear();
175+
children.insert(children.end(), m_children.begin(), m_children.begin() + child_index + 1);
176+
177+
for (size_t i = child_index + 1; i < m_children.size(); ++i)
178+
if (m_children[i]->is_duplicate(item))
179+
duplicates.push_back(m_children[i]);
180+
else
181+
children.push_back(m_children[i]);
182+
183+
// merge duplicates
184+
for (auto &i : duplicates)
185+
{
186+
item->add_linked(i);
187+
item->m_children.insert(item->m_children.end(), i->m_children.begin(),
188+
i->m_children.end());
189+
for (auto &i_children : i->m_children)
190+
i_children->set_parent(item->m_id, true);
191+
}
192+
193+
m_children = children;
194+
}
195+
147196
// set parent, forced
148197
for (auto item : m_children)
149198
item->set_parent(m_id, true);

importer/src/hierarchyitem.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ class HierarchyItem
2020
hindex linked_id() const { return m_linked_id; }
2121
hindex parent_id() const { return m_parent_id; }
2222
const std::string &country() const { return m_country; }
23-
bool keep() const;
23+
24+
bool keep() const;
25+
bool is_duplicate(std::shared_ptr<HierarchyItem> item) const;
2426

2527
const std::deque<std::shared_ptr<HierarchyItem> > &children() { return m_children; }
2628

@@ -37,6 +39,9 @@ class HierarchyItem
3739
static void load_priority_list(const std::string &fname);
3840
static void load_skip_list(const std::string &fname);
3941

42+
protected:
43+
void set_names();
44+
4045
private:
4146
hindex m_id;
4247
hindex m_linked_id{ 0 };

importer/src/utils.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ std::map<std::string, std::string> parse_to_map(const std::string &js)
2525
return m;
2626
}
2727

28+
/// Returns true only when both `s1` and `s2` begin with the prefix `start`.
/// An empty `start` is a prefix of every string, so it always yields true.
bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2)
{
    const std::string::size_type prefix_len = start.size();

    // compare() clamps the length to the string's size, so a string shorter
    // than the prefix simply compares unequal.
    if (s1.compare(0, prefix_len, start) != 0)
        return false;

    return s2.compare(0, prefix_len, start) == 0;
}
32+
2833
std::string geocoder_type(const std::string &t_class, const std::string &t_value)
2934
{
3035
if (t_value == "yes" || t_value.empty())

importer/src/utils.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66

77
std::string get_with_def(const std::map<std::string, std::string> &m, const std::string &key,
88
const std::string &defval = std::string());
9-
std::map<std::string, std::string> parse_to_map(const std::string &js);
9+
1010
std::string geocoder_type(const std::string &t_class, const std::string &t_value);
1111

12+
std::map<std::string, std::string> parse_to_map(const std::string &js);
13+
14+
bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2);
15+
1216
#endif

0 commit comments

Comments
 (0)