Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/undate/converters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,19 @@ Converters
Overview
--------


.. note::

   Base converter is pretty developer-facing documentation;
   calendar/converter usage would be helpful to add.

Comment on lines 7 to 11
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

RST directive syntax is malformed.

The note directive should use .. note:: (two dots, space, directive name, double colon). The current syntax will not render as a note block in Sphinx documentation.

🔎 Proposed fix
-..note: base converter is pretty developer-facing documentation; 
-.. calendar/converter usage would be helpful to add
+.. note::
+
+   Base converter is pretty developer-facing documentation;
+   calendar/converter usage would be helpful to add.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
..note: base converter is pretty developer-facing documentation;
.. calendar/converter usage would be helpful to add
.. note::
Base converter is pretty developer-facing documentation;
calendar/converter usage would be helpful to add.
🤖 Prompt for AI Agents
In docs/undate/converters.rst around lines 7 to 10, the RST note directive is
malformed ("..note:") and won't render; change it to the correct Sphinx syntax
(".. note::") and ensure a blank line after the directive before the note
content, and also add a short example section showing calendar/converter usage
(header, brief usage snippet or steps) to make the converter guidance more
user-facing.

.. automodule:: undate.converters.base
:members:
:undoc-members:


.. autoclass:: undate.converters.combined.OmnibusDateConverter
:members:


Formats
--------

Expand Down
4 changes: 3 additions & 1 deletion src/undate/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from undate.converters.base import BaseDateConverter as BaseDateConverter
from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH

__all__ = ["BaseDateConverter", "GRAMMAR_FILE_PATH"]
5 changes: 5 additions & 0 deletions src/undate/converters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

import importlib
import logging
import pathlib
import pkgutil
from functools import cache
from typing import Dict, Type
Expand All @@ -53,6 +54,10 @@
logger = logging.getLogger(__name__)


#: Path to parser grammar files
GRAMMAR_FILE_PATH = pathlib.Path(__file__).parent / "grammars"


class BaseDateConverter:
"""Base class for parsing, formatting, and converting dates to handle
specific formats and different calendars."""
Expand Down
6 changes: 3 additions & 3 deletions src/undate/converters/calendars/hebrew/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "hebrew.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and dates
Expand Down
9 changes: 5 additions & 4 deletions src/undate/converters/calendars/hebrew/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ def hebrew_date(self, items):

# initialize and return an undate with year, month, day and
# configured calendar (hebrew by default)
# NOTE: use self.calendar so Seleucid can extend more easily
return Undate(**parts, calendar=self.calendar)

# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Collapse the parsed year parts into a single ``year`` tree node."""
    # stringify every part and concatenate them into one value
    year_text = "".join(map(str, items))
    return Tree(data="year", children=[year_text])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
6 changes: 3 additions & 3 deletions src/undate/converters/calendars/islamic/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "islamic.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "islamic.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and days
Expand Down
13 changes: 11 additions & 2 deletions src/undate/converters/calendars/islamic/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ def islamic_date(self, items):

# year and day values can arrive as more than one token when parsed
# through the combined grammar, so they are joined into a single string explicitly
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Collapse the parsed year parts into a single ``year`` tree node."""
    # the combined parser can hand us an extra anonymous token, so
    # stringify every part and concatenate them into one value
    year_text = "".join(map(str, items))
    return Tree(data="year", children=[year_text])

def day(self, items):
    """Collapse the parsed day parts into a single ``day`` tree node."""
    # the combined parser can hand us an extra anonymous token, so
    # stringify every part and concatenate them into one value
    day_text = "".join(map(str, items))
    return Tree(data="day", children=[day_text])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
85 changes: 85 additions & 0 deletions src/undate/converters/combined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
**Experimental** combined parser. Supports EDTF, Hebrew, and Hijri
where dates are unambiguous. (Year-only dates are parsed as EDTF in
Gregorian calendar.)
"""

from typing import Union

from lark import Lark
from lark.exceptions import UnexpectedCharacters
from lark.visitors import Transformer, merge_transformers

from undate import Undate, UndateInterval
from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH
from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.islamic.transformer import IslamicDateTransformer


class CombinedDateTransformer(Transformer):
    """Transformer for the ``start`` rule of the combined grammar."""

    def start(self, children):
        """Return the children of the ``start`` rule unchanged."""
        # nothing to do at this level: returning the children lets the
        # transformer for the matching part of the grammar supply the result
        return children


# NOTE: currently year-only dates in combined parser are interpreted as
# EDTF and use Gregorian calendar.
# In future, we could refine by adding calendar names & abbreviations
# to the parser in order to recognize years from other calendars.

combined_transformer = merge_transformers(
# base transformer first; each keyword below is a namespace prefix and
# should line up with the ``edtf__`` / ``hebrew__`` / ``islamic__`` rule
# names declared in combined.lark
CombinedDateTransformer(),
edtf=EDTFTransformer(),
hebrew=HebrewDateTransformer(),
islamic=IslamicDateTransformer(),
)


# open based on filename so we can specify relative import path based on grammar file
# NOTE(review): GRAMMAR_FILE_PATH looks like an absolute path already, in which
# case rel_to would have no effect — confirm before relying on it
parser = Lark.open(
str(GRAMMAR_FILE_PATH / "combined.lark"), rel_to=__file__, strict=True
)


class OmnibusDateConverter(BaseDateConverter):
    """
    Combination parser that aggregates existing parser grammars.
    Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous.
    (Year-only dates are parsed as EDTF in Gregorian calendar.)

    Does not support serialization.

    Example usage::

        Undate.parse("Tammuz 4816", "omnibus")

    """

    #: converter name: omnibus
    name: str = "omnibus"

    def __init__(self):
        # reuse the module-level merged transformer rather than building one per instance
        self.transformer = combined_transformer

    def parse(self, value: str) -> Union[Undate, UndateInterval]:
        """
        Parse a string in a supported format and return an :class:`~undate.undate.Undate`
        or :class:`~undate.undate.UndateInterval`.

        :param value: date string to parse
        :raises ValueError: if ``value`` is empty, or if the parser reports
            unexpected characters in the input
        """
        if not value:
            raise ValueError("Parsing empty/unset string is not supported")

        # parse the input string, then transform to undate object
        try:
            parsetree = parser.parse(value)
            # transform returns a list; we want the first item in the list
            return self.transformer.transform(parsetree)[0]
        except UnexpectedCharacters as err:
            # chain the parser error so the position/details of the failure
            # remain available to callers inspecting __cause__
            raise ValueError(
                "Parsing failed: '%s' is not in a recognized date format" % value
            ) from err

    def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
        """Not supported by this converter. Will raise :class:`ValueError`"""
        raise ValueError("Omnibus converter does not support serialization")
6 changes: 3 additions & 3 deletions src/undate/converters/edtf/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "edtf.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "edtf.lark"

with open(grammar_path) as grammar:
edtf_parser = Lark(grammar.read(), start="edtf")
5 changes: 4 additions & 1 deletion src/undate/converters/edtf/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ def day_unspecified(self, items):
def date_level1(self, items):
return self.date(items)

# year (including negative years) use default transformation
def year(self, items):
    """Build a ``year`` tree whose single child is the joined year text."""
    # numeric and unknown parts are concatenated into one string
    parts = self.get_values(items)
    return Tree(data="year", children=["".join(parts)])

def year_fivedigitsplus(self, items):
# strip off the leading Y and convert to integer
Expand Down
32 changes: 32 additions & 0 deletions src/undate/converters/grammars/combined.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
%import common.WS
%ignore WS

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )

// Renaming of the import variables is required, as they receive the namespace of this file.
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565

// All grammars are in the same directory, so we can use relative imports

// relative import from edtf.lark
%import .edtf.edtf -> edtf__start

// relative import from hebrew.lark
%import .hebrew.hebrew_date -> hebrew__hebrew_date
%import .hebrew.day -> hebrew__day
%import .hebrew.month -> hebrew__month
%import .hebrew.year -> hebrew__year

// relative import from islamic.lark
%import .islamic.islamic_date -> islamic__islamic_date
%import .islamic.day -> islamic__day
%import .islamic.month -> islamic__month
%import .islamic.year -> islamic__year


// override hebrew date to omit year-only, since year without calendar is ambiguous
// NOTE: potentially support year with calendar label
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year

// same for islamic date, year alone is ambiguous
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,23 @@ hebrew_date: weekday? day month comma? year | month year | year
// PGP dates use qualifiers like "first decade of" (for beginning of month)
// "first third of", seasons (can look for more examples)

// Hebrew calendar starts with year 1 in 3761 BCE
// Hebrew calendar starts with year 1 in 3761 BCE
year: /\d+/

// months
month: month_1
| month_2
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/

Expand Down
45 changes: 45 additions & 0 deletions tests/test_converters/test_combined_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pytest

from undate.converters.combined import parser, combined_transformer

from undate import Undate, UndateInterval

# test that valid dates can be parsed

testcases = [
# each entry is a (date_string, expected) pair, matching the argument
# names used by the parametrized tests in this module
# EDTF
("1984", Undate(1984)),
("201X", Undate("201X")),
("20XX", Undate("20XX")),
("2004-XX", Undate(2004, "XX")),
("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
# Hebrew / Anno Mundi calendar
("Tammuz 4816", Undate(4816, 4, calendar="Hebrew")),
# Islamic / Hijri calendar
("Jumādā I 1243", Undate(1243, 5, calendar="Islamic")),
("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")),
("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Islamic")),
]


@pytest.mark.parametrize("date_string,expected", testcases)
def test_transform(date_string, expected):
    """Exercise the parser and merged transformer directly, without the converter class."""
    # parse the input, then run the tree through the combined transformer
    tree = parser.parse(date_string)
    results = combined_transformer.transform(tree)
    # the same unknown date is not considered strictly equal, so the
    # comparison is made on the object representations instead
    actual = results[0]
    assert repr(actual) == repr(expected)


@pytest.mark.parametrize("date_string,expected", testcases)
def test_converter(date_string, expected):
    """Going through the converter class should give the same results."""
    parsed = Undate.parse(date_string, "omnibus")
    assert repr(parsed) == repr(expected)


def test_no_serialize():
    """Formatting with the omnibus converter must raise ``ValueError``."""
    # build the date outside the raises block so that only the format()
    # call can satisfy the expected exception
    undate = Undate("2022")
    with pytest.raises(ValueError, match="does not support"):
        undate.format("omnibus")
Loading