Skip to content

Commit 8bf34c9

Browse files
committed
create flattened output for details (close #283)
1 parent 7b0e1a3 commit 8bf34c9

File tree

6 files changed

+471
-3
lines changed

6 files changed

+471
-3
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## Unreleased
44

5+
## [v1.11.3] - dt
6+
7+
- Add [#283]: parsed.Flatten function to generate 'flat' version of parsed data.
58
- Add: update upload-artifacts in Actions to v4.
69
- Add [#279]: add `unr.` rank.
710

@@ -510,6 +513,7 @@
510513
This document follows [changelog guidelines]
511514

512515

516+
[v1.11.3]: https://github.com/gnames/gnparser/compare/v1.11.2...v1.11.3
513517
[v1.11.2]: https://github.com/gnames/gnparser/compare/v1.11.1...v1.11.2
514518
[v1.11.1]: https://github.com/gnames/gnparser/compare/v1.11.0...v1.11.1
515519
[v1.11.0]: https://github.com/gnames/gnparser/compare/v1.10.4...v1.11.0

ent/parsed/flatten.go

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
package parsed
2+
3+
import "strings"
4+
5+
// ParsedFlat is the result of parsing a scientific name-string, flattened
// into a single level of scalar fields for convenience (for example, for
// tabular output).
type ParsedFlat struct {
	// Parsed is false if parsing did not succeed.
	Parsed bool `json:"parsed"`

	// NomCode modifies parsing rules according to provided nomenclatural code.
	NomCode string `json:"nomenclaturalCode,omitempty"`

	// ParseQuality is a number that represents the quality of the
	// parsing.
	//
	//	0 - name-string is not parseable
	//	1 - no parsing problems encountered
	//	2 - small parsing problems
	//	3 - serious parsing problems
	//	4 - severe problems, name could not be parsed completely
	//
	// The ParseQuality is equal to the quality of the most
	// severe warning (see qualityWarnings). If no problems
	// are encountered, and the parsing succeeded, the parseQuality
	// is set to 1. If parsing failed, the parseQuality is 0.
	ParseQuality int `json:"quality"`

	// Verbatim is the input name-string without modifications.
	Verbatim string `json:"verbatim"`

	// Normalized is a normalized version of the input name-string.
	Normalized string `json:"normalized,omitempty"`

	// CanonicalSimple is a simplified version of a name where some elements
	// like ranks, or hybrid signs "×" are omitted (hybrid signs are present
	// for hybrid formulas).
	//
	// It is most useful to match names in general.
	CanonicalSimple string `json:"canonicalSimple,omitempty"`

	// CanonicalFull is a canonical form that keeps hybrid signs "×" for named
	// hybrids and shows infra-specific ranks.
	//
	// It is most useful for detection of the best matches from
	// multiple results. It is also recommended for displaying
	// canonical forms of botanical names.
	CanonicalFull string `json:"canonicalFull,omitempty"`

	// CanonicalStemmed is the most "normalized" and simplified version of the
	// name. Species epithets are stripped of suffixes, "j" character converted
	// to "i", "v" character converted to "u" according to "Schinke R,
	// Greengrass M, Robertson AM and Willett P (1996)".
	//
	// It is most useful to match names when a variability in suffixes is
	// possible.
	CanonicalStemmed string `json:"canonicalStemmed,omitempty"`

	// Cardinality allows sorting and partitioning of names according to the
	// number of elements in their canonical forms.
	//
	//	0 - cardinality cannot be calculated
	//	1 - uninomial
	//	2 - binomial
	//	3 - trinomial
	//	4 - quadrinomial
	Cardinality int `json:"cardinality"`

	// Rank provides information about the rank of the name. It is not
	// always possible to infer rank correctly, so this field will be
	// omitted when the data for it does not exist.
	Rank string `json:"rank,omitempty"`

	// Authorship is the verbatim authorship of the name.
	Authorship string `json:"authorship,omitempty"`

	// Bacteria is not empty if the input name has a genus
	// that is registered as bacterial. Possible
	// values are "maybe" - if the genus has homonyms in other groups
	// and "yes" if GNparser dictionary does not detect any homonyms.
	// Flatten writes "no" for any other valid value of the source field.
	//
	// The bacterial names often contain strain information which are
	// not parseable and are placed into the "tail" field.
	Bacteria string `json:"bacteria,omitempty"`

	// Candidatus indicates that the parsed string is a candidatus bacterial
	// name.
	Candidatus bool `json:"candidatus,omitempty"`

	// Virus is set to true in case if name is not parsed, and probably
	// belongs to a wide variety of sub-cellular entities like
	//
	//   - viruses
	//   - plasmids
	//   - prions
	//   - RNA
	//   - DNA
	//
	// Viruses are the vast majority in this group of names,
	// as a result they gave (very imprecise) name to
	// the field.
	//
	// We do plan to create a parser for viruses at some point,
	// which will expand this group into more precise categories.
	Virus bool `json:"virus,omitempty"`

	// Cultivar is true if a name was parsed as a cultivar.
	Cultivar bool `json:"cultivar,omitempty"`

	// DaggerChar is true if a name-string includes '†' rune.
	// This rune might mean a fossil, or be indication of the clade extinction.
	DaggerChar bool `json:"daggerChar,omitempty"`

	// Hybrid is a string representation of a hybrid type.
	//
	//   - a non-categorized hybrid
	//   - named hybrid
	//   - notho- hybrid
	//   - hybrid formula
	Hybrid string `json:"hybrid,omitempty"`

	// GraftChimera is a string representation of graft chimera.
	//
	//   - a non-categorized graft chimera
	//   - named graft chimera
	//   - graft chimera formula
	GraftChimera string `json:"graftchimera,omitempty"`

	// Surrogate is a string representation of a surrogate type.
	//
	//   - a non-categorized surrogates
	//   - surrogate names from BOLD project
	//   - comparisons (Homo cf. sapiens)
	//   - approximations (names for specimen that not fully identified)
	Surrogate string `json:"surrogate,omitempty"`

	// Tail is an unparseable tail of a name. It might contain "junk",
	// annotations, malformed parts of a scientific name, taxonomic concept
	// indications, bacterial strains etc. If there is an unparseable tail, the
	// quality of the name-parsing is set to the worst category.
	Tail string `json:"tail,omitempty"`

	// Uninomial represents the single name used for uninomial nomenclature,
	// typically applied to higher taxonomic ranks (e.g., family or order names
	// like "Asteraceae"). This field is populated only for uninomial names and
	// omitted otherwise.
	Uninomial string `json:"uninomial,omitempty"`

	// Genus specifies the genus part of a binomial or trinomial scientific name
	// (e.g., "Quercus" in "Quercus robur"). This field is empty if the name is
	// uninomial.
	Genus string `json:"genus,omitempty"`

	// Subgenus indicates the infrageneric epithet when present.
	// This field is omitted if not applicable.
	Subgenus string `json:"infragenericEpithet,omitempty"`

	// Species is the specific epithet of a binomial or trinomial.
	Species string `json:"specificEpithet,omitempty"`

	// Infraspecies is the infraspecific epithet of trinomials (names with
	// cardinality 3). We do not provide details for names with higher
	// cardinality.
	Infraspecies string `json:"infraspecificEpithet,omitempty"`

	// CultivarEpithet contains the cultivar name for cultivated plant varieties
	// (e.g., "Golden Delicious" in "Malus domestica 'Golden Delicious'"). This
	// field is populated only for names that include a cultivar designation.
	CultivarEpithet string `json:"cultivarEpithet,omitempty"`

	// Notho denotes the hybrid status of a name, indicating whether it is a
	// hybrid (e.g., "nothosubsp." or "nothovar." in "Salvia × sylvestris"). This
	// field is empty if not given.
	Notho string `json:"notho,omitempty"`

	// CombinationAuthorship provides the authorship for the current combination
	// of the name, typically the authors who transferred the species to a new
	// genus. (e.g., "K." in "Aus bus (L.) K."). This field is
	// omitted if no combination authorship is specified.
	CombinationAuthorship string `json:"combinationAuthorship,omitempty"`

	// CombinationExAuthorship captures the "ex" part of the combination
	// authorship (e.g., "ex DC." in "Quercus robur L. ex DC."). This field is
	// empty if no "ex" authorship exists.
	CombinationExAuthorship string `json:"combinationExAuthorship,omitempty"`

	// CombinationAuthorshipYear records the year associated with the combination
	// authorship, if provided (e.g., "1753" in "Homo sapiens (L.) K. 1753").
	// This field is omitted if the year is not specified.
	CombinationAuthorshipYear string `json:"combinationAuthorshipYear,omitempty"`

	// BasionymAuthorship identifies the authorship of the original combination
	// of the name (e.g., "Mill." in "Quercus robur (Mill.) L." where Mill. is
	// the original author). This field is populated only if basionym authorship
	// is present.
	BasionymAuthorship string `json:"basionymAuthorship,omitempty"`

	// BasionymExAuthorship specifies the "ex" part of the basionym authorship,
	// if applicable (e.g., "ex Torr." in "Pinus ponderosa Douglas ex Torr.").
	// This field is empty when no "ex" basionym authorship is provided.
	BasionymExAuthorship string `json:"basionymExAuthorship,omitempty"`

	// BasionymAuthorshipYear indicates the year tied to the basionym authorship
	// (e.g., "1820" in "Pinus ponderosa Douglas, 1820"). This field is included
	// only when the basionym year is explicitly stated.
	BasionymAuthorshipYear string `json:"basionymAuthorshipYear,omitempty"`

	// VerbatimID is a UUID v5 generated from the verbatim value of the
	// input name-string. Every unique string always generates the same
	// UUID.
	VerbatimID string `json:"id"`

	// ParserVersion is the version number of the GNparser.
	ParserVersion string `json:"parserVersion"`
}
215+
216+
// Flatten converts a Parsed struct into a ParsedFlat struct, which is a
217+
// flattened representation of the parsed data.
218+
func (p Parsed) Flatten() ParsedFlat {
219+
var bact string
220+
if p.Bacteria != nil && p.Bacteria.Valid {
221+
switch p.Bacteria.Value {
222+
case 0:
223+
bact = "maybe"
224+
case 1:
225+
bact = "yes"
226+
default:
227+
bact = "no"
228+
}
229+
}
230+
var hybrid string
231+
if p.Hybrid != nil {
232+
hybrid = p.Hybrid.String()
233+
}
234+
var graft string
235+
if p.GraftChimera != nil {
236+
graft = p.GraftChimera.String()
237+
}
238+
var surrogate string
239+
if p.Surrogate != nil {
240+
surrogate = p.Surrogate.String()
241+
}
242+
243+
res := ParsedFlat{
244+
Parsed: p.Parsed,
245+
NomCode: p.NomCode,
246+
ParseQuality: p.ParseQuality,
247+
Verbatim: p.Verbatim,
248+
Normalized: p.Normalized,
249+
Cardinality: p.Cardinality,
250+
Rank: p.Rank,
251+
Bacteria: bact,
252+
Candidatus: p.Candidatus,
253+
Virus: p.Virus,
254+
Cultivar: p.Cultivar,
255+
DaggerChar: p.DaggerChar,
256+
Hybrid: hybrid,
257+
GraftChimera: graft,
258+
Surrogate: surrogate,
259+
Tail: p.Tail,
260+
VerbatimID: p.VerbatimID,
261+
ParserVersion: p.ParserVersion,
262+
}
263+
if !p.Parsed {
264+
return res
265+
}
266+
267+
res.CanonicalSimple = p.Canonical.Simple
268+
res.CanonicalFull = p.Canonical.Full
269+
res.CanonicalStemmed = p.Canonical.Stemmed
270+
271+
if p.Authorship != nil {
272+
au := p.Authorship
273+
res.Authorship = au.Verbatim
274+
275+
if au.Original != nil {
276+
res.BasionymAuthorship = authorship(au.Original)
277+
res.BasionymExAuthorship = exAuthorship(au.Original)
278+
res.BasionymAuthorshipYear = year(au.Original)
279+
}
280+
281+
if au.Combination != nil {
282+
res.CombinationAuthorship = authorship(au.Combination)
283+
res.CombinationExAuthorship = exAuthorship(au.Combination)
284+
res.CombinationAuthorshipYear = year(au.Combination)
285+
}
286+
}
287+
288+
switch detail := p.Details.(type) {
289+
case DetailsUninomial:
290+
res.Uninomial = detail.Uninomial.Value
291+
case DetailsSpecies:
292+
res.Genus = detail.Species.Genus
293+
res.Subgenus = detail.Species.Subgenus
294+
res.Species = detail.Species.Species
295+
case DetailsInfraspecies:
296+
if len(detail.Infraspecies.Infraspecies) == 1 {
297+
res.Genus = detail.Infraspecies.Genus
298+
res.Species = detail.Infraspecies.Species.Species
299+
res.Rank = detail.Infraspecies.Infraspecies[0].Rank
300+
res.Infraspecies = detail.Infraspecies.Infraspecies[0].Value
301+
}
302+
res.BasionymAuthorshipYear = year(detail.Infraspecies.Authorship.Original)
303+
}
304+
return res
305+
}
306+
307+
func authorship(ag *AuthGroup) string {
308+
if ag == nil {
309+
return ""
310+
}
311+
return joinAuthors(ag.Authors)
312+
}
313+
314+
// joinAuthors renders a list of author names as a single string: all names
// but the last are separated by ", ", and the last one is attached with
// " & " (e.g. "A, B & C"). An empty list gives "", a single name is
// returned unchanged.
func joinAuthors(aus []string) string {
	n := len(aus)
	if n == 0 {
		return ""
	}
	if n == 1 {
		return aus[0]
	}
	// For two names the head is just the first name, giving "A & B".
	head := strings.Join(aus[:n-1], ", ")
	return head + " & " + aus[n-1]
}
329+
330+
func exAuthorship(ag *AuthGroup) string {
331+
if ag == nil || ag.ExAuthors == nil {
332+
return ""
333+
}
334+
return joinAuthors(ag.ExAuthors.Authors)
335+
}
336+
337+
func year(ag *AuthGroup) string {
338+
if ag == nil || ag.Year == nil {
339+
return ""
340+
}
341+
if ag.Year.IsApproximate {
342+
return "(" + ag.Year.Value + ")"
343+
}
344+
return ag.Year.Value
345+
}

0 commit comments

Comments
 (0)