diff --git a/docs/api.rst b/docs/api.rst index c40945c..9bd1154 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -151,6 +151,20 @@ Functions: eskin.abctools_url_to_abc eskin.abc_to_abctools_url +Bill Black +---------- + +.. automodule:: pyabc2.sources.bill_black + +Functions: + +.. currentmodule:: pyabc2.sources + +.. autosummary:: + :toctree: api/ + + bill_black.load_meta + abcjs tools =========== diff --git a/docs/examples/sources.ipynb b/docs/examples/sources.ipynb index 259debe..d807252 100644 --- a/docs/examples/sources.ipynb +++ b/docs/examples/sources.ipynb @@ -17,7 +17,8 @@ "metadata": {}, "outputs": [], "source": [ - "from pyabc2.sources import load_example, norbeck, the_session, eskin" + "from pyabc2 import Tune\n", + "from pyabc2.sources import load_example, norbeck, the_session, eskin, bill_black" ] }, { @@ -401,8 +402,6 @@ "metadata": {}, "outputs": [], "source": [ - "from pyabc2 import Tune\n", - "\n", "Tune(df.query(\"group == 'jigs'\").iloc[0].abc)" ] }, @@ -419,6 +418,38 @@ "display(Markdown(f\"<{url}>\"))\n", "eskin.load_url(url)" ] + }, + { + "cell_type": "markdown", + "id": "36", + "metadata": {}, + "source": [ + "## Bill Black\n", + "\n", + "Bill Black has an extensive ABC library, available at <http://www.capeirish.com/ittl/>.\n", + "We can load all of the tune blocks (strings) with {func}`pyabc2.sources.bill_black.load_meta`."
"""
Bill Black's Irish Traditional Tune Library

http://www.capeirish.com/ittl/
"""

import logging
import re
from pathlib import Path

from pyabc2._util import get_logger as _get_logger

logger = _get_logger(__name__)

HERE = Path(__file__).parent

SAVE_TO = HERE / "_bill-black"
TXT_FNS = [
    "a-tunes-1.txt",
    "b-tunes-1.txt",
    "c-tunes-1.txt",
    "d-tunes-1.txt",
    "e-tunes-1.txt",
    "f-tunes-1.txt",
    "g-tunes-1.txt",
    "h-tunes-1.txt",
    "i-tunes-1.txt",
    "j-tunes-1.txt",
    "k-tunes-1.txt",
    "l-tunes-1.txt",
    "m-tunes-1.txt",
    "n-tunes-1.txt",
    "o-tunes-1.txt",
    "pq-tunes-1.txt",
    "r-tunes-1.txt",
    "s-tunes-2.txt",
    "t-tunes-1.txt",
    "uv-tunes-1.txt",
    "wz-tunes-1.txt",
]

# Name of the archive (inside `SAVE_TO`) that holds the downloaded text files
_ZIP_NAME = "bill_black_alltunes_text.zip"

# X values, by file name prefix, of tunes that have a tune above them
# without an empty line in between
_UNSEPARATED_X_VALS = {
    "c-tunes": (253, 666),
    "d-tunes": (223,),
    "e-tunes": (34,),
}


def download() -> None:
    """Download the alphabetical text files from http://www.capeirish.com/ittl/alltunes/text/
    and store them in a compressed archive.

    All files are fetched before the archive is opened for writing,
    so a failed request leaves an existing archive untouched
    instead of replacing it with an incomplete one.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status for any of the files.
    """
    import zipfile
    from concurrent.futures import ThreadPoolExecutor

    import requests

    def download_one(url: str) -> str:
        r = requests.get(url, headers={"User-Agent": "pyabc2"}, timeout=5)
        r.raise_for_status()
        return r.text

    urls = [f"http://www.capeirish.com/ittl/alltunes/text/{fn}" for fn in TXT_FNS]
    with ThreadPoolExecutor(max_workers=4) as executor:
        # `map` yields the results in input order and re-raises the first failure here,
        # i.e. before anything has been written to disk
        texts = list(executor.map(download_one, urls))

    SAVE_TO.mkdir(exist_ok=True)

    with zipfile.ZipFile(
        SAVE_TO / _ZIP_NAME,
        "w",
        compression=zipfile.ZIP_DEFLATED,
    ) as zf:
        for fn, text in zip(TXT_FNS, texts, strict=True):
            zf.writestr(fn, text)


def _split_tune_blocks(text: str, fn: str) -> list[str]:
    """Split the text of one alphabetical file into ABC tune blocks.

    Lines that start with ``%`` are removed, anything before the first ``X:`` is skipped,
    and fully duplicate blocks are dropped (keeping the first occurrence).

    Parameters
    ----------
    text
        Decoded contents of the file.
    fn
        Name of the file, used to select file-specific fixes and in log/error messages.

    Raises
    ------
    RuntimeError
        If the text contains no ``X:`` at all.
    """
    from collections import Counter
    from textwrap import indent

    # A tune block starts with the X: line and ends with a blank line
    # or the end of the file.
    # Unlike the RTF files, %%% is not _necessarily_ present as a tune separator.

    # Remove all lines that start with %
    text = "\n".join(
        line.strip() for line in text.splitlines() if not line.lstrip().startswith("%")
    )

    # Find the start of the first tune, in order to skip header info
    start = text.find("X:")
    if start == -1:  # pragma: no cover
        raise RuntimeError(f"Unable to find first tune in Bill Black file {fn!r}")
    text = text[start:]

    # Separate some two-tune blocks
    for prefix, x_vals in _UNSEPARATED_X_VALS.items():
        if fn.startswith(prefix):
            for n in x_vals:
                text = text.replace(f"X:{n}", f"\nX:{n}")
            break

    expected_num = text.count("X:")

    blocks = []
    for block in re.split(r"\n{2,}", text.rstrip()):
        block = block.strip()
        if not block:  # pragma: no cover
            continue

        # The first line of this block is missing its leading `X`,
        # so it was not included in the `X:` count above
        if block.startswith(":313\nT:GRAVEL WALK (reel) (1), The"):
            block = "X" + block
            expected_num += 1

        if not block.startswith("X:"):
            # First look for tune later in the block
            # Some blocks start with comment text,
            # sometimes including other settings but without `X:`
            start = block.find("X:")
            if start == -1:
                logger.info(f"skipping non-tune block in {fn!r}:\n{indent(block, '| ')}")
                continue
            block = block[start:]

        if block.count("X:") > 1:  # pragma: no cover
            logger.warning(f"multiple X: lines in block in {fn!r}:\n{indent(block, '| ')}")

        blocks.append(block)

    if len(blocks) != expected_num:  # pragma: no cover
        logger.warning(f"expected {expected_num} tunes in {fn!r}, but found {len(blocks)}")

    # Drop fully duplicate tune blocks while preserving order
    unique_blocks = list(dict.fromkeys(blocks))
    if len(unique_blocks) < len(blocks):
        logger.info(
            f"removed {len(blocks) - len(unique_blocks)}/{len(blocks)} fully duplicate "
            f"tune blocks in {fn!r}"
        )

    # Report how many X: lines are shared by more than one (non-identical) block
    x_counts = Counter(block.splitlines()[0] for block in unique_blocks)
    x_count_counts = Counter(x_counts.values())
    if set(x_count_counts) != {1}:
        s_counts = ", ".join(f"{m} ({n})" for m, n in sorted(x_count_counts.items()))
        logger.info(f"non-unique X vals in {fn!r}: {s_counts}")

    return unique_blocks


def load_meta(*, redownload: bool = False, debug: bool = False) -> list[str]:
    """Load all data, splitting into ABC tune blocks and removing lines that start with ``%``.

    Parameters
    ----------
    redownload
        Re-download the data file.
    debug
        Show debug messages.

    Returns
    -------
    list of str
        The tune blocks, each starting with ``X:``, in archive order.
    """
    import zipfile

    if debug:  # pragma: no cover
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.NOTSET)

    zip_path = SAVE_TO / _ZIP_NAME
    if redownload or not zip_path.is_file():
        print("downloading...", end=" ", flush=True)
        download()
        print("done")

    abcs = []
    with zipfile.ZipFile(zip_path, "r") as zf:
        for zi in zf.infolist():
            fn = zi.filename
            logger.debug(f"Loading {fn!r}")
            text = zf.read(zi).decode("utf-8")
            abcs.extend(_split_tune_blocks(text, fn))

    return abcs


if __name__ == "__main__":  # pragma: no cover
    tunes = load_meta(debug=True)
    print()
    print(tunes[0])
"""
Bill Black's Irish Traditional Tune Library

http://www.capeirish.com/ittl/

As of the 2025-06-14 update, the "tunefolders" method is deprecated.
Bill Black is now using the Eskin ABC Tools (http://www.capeirish.com/ittl/alltunes/html/),
while also posting ABC text files (http://www.capeirish.com/ittl/alltunes/text/),
both split up alphabetically by tune name.
"""

import logging
from collections.abc import Iterable
from dataclasses import dataclass, field
from pathlib import Path

from pyabc2._util import get_logger as _get_logger

logger = _get_logger(__name__)

HERE = Path(__file__).parent

ITTL = "http://www.capeirish.com/ittl/"
SAVE_TO = HERE / "_bill-black_tunefolders"


@dataclass
class Collection:
    """A tune collection in the "tunefolders" section of the site."""

    # Short identifier used to look the collection up (see `get_collection`)
    key: str
    # Human-readable name of the collection
    title: str
    # Name of the collection's folder under ``tunefolders/``
    folder: str
    # Subfolders of `folder`, each expected to hold a ``{subfolder}-ABC.rtf`` file
    subfolders: list[str] = field(default_factory=list)
    # Explicit file URLs; if non-empty, these are used instead of the computed ones
    urls: list[str] = field(default_factory=list)

    @property
    def abc_urls(self) -> list[str]:
        """URLs of the ABC RTF files that make up the collection."""
        if self.urls:
            return self.urls

        num = self.folder
        if not self.subfolders:
            return [f"{ITTL}tunefolders/{num}/{num}-ABC.rtf"]

        return [
            f"{ITTL}tunefolders/{num}/{subfolder}/{subfolder}-ABC.rtf"
            for subfolder in self.subfolders
        ]

    @property
    def files(self) -> list[Path]:
        """Local (gzipped) file paths corresponding to :attr:`abc_urls`."""
        return [self.url_to_file(url) for url in self.abc_urls]

    def url_to_file(self, url: str) -> Path:
        """Convert a URL to a file path.

        The file name is the last component of the URL plus ``.gz``.
        If that name doesn't already start with the folder number, the folder number
        is prepended, since some collections use the same subfolder names
        (e.g. ``hps``, ``jigs``) and would otherwise share the same local files.
        """
        name = url.split("/")[-1]
        if not name.startswith(self.folder):
            name = f"{self.folder}-{name}"
        return SAVE_TO / f"{name}.gz"


COLLECTIONS: list[Collection] = [
    Collection(
        key="aif",
        title="Allan's Irish Fiddler",
        folder="11",
    ),
    Collection(
        key="bbmg",
        title="BB's Mostly Gems",
        folder="12",
        subfolders=["12-AE", "12-FJ", "12-KQ", "12-RST", "12-UY"],
    ),
    Collection(
        key="bs",
        title="Bulmer & Sharpley",
        folder="13",
        subfolders=["13-hps", "13-jigs", "13-misc", "13-p&s", "13-reels", "13-sjigs"],
    ),
    Collection(
        key="car",
        title="Carolan Tunes",
        folder="14",
    ),
    Collection(
        key="cre",
        title="Ceol Rince na hÉireann",
        folder="18",
        subfolders=["18-hornpipes", "18-jigs", "18-polkas_slides", "18-reels", "18-slipjigs"],
    ),
    Collection(
        key="dmi",
        title="Dance Music of Ireland",
        folder="21",
        subfolders=["hps", "jigs", "reels", "slipjigs"],
    ),
    Collection(
        key="dmwc",
        title="Dance Music of Willie Clancy",
        folder="22",
        subfolders=["22-hps", "22-jigs", "22-misc", "22-reels", "22-sjigs"],
    ),
    Collection(
        key="foinn",
        title="Foinn Seisiún",
        folder="25",
        subfolders=["hps", "jigs", "misc", "p&s", "reels"],
    ),
    Collection(
        key="jol",
        title="Johnny O'Leary of Sliabh Luachra",
        folder="31",
        subfolders=["31-hps", "31-jigs", "31-misc", "31-polkas", "31-reels", "31-slides"],
    ),
    Collection(
        key="levey",
        title="Levey Collection",
        folder="33",
        subfolders=["33-hps", "33-jigs", "33-marches", "33-reels", "33-sjigs"],
    ),
    Collection(
        key="ofpc",
        title="O'Farrell's Pocket Companion",
        folder="48",
        subfolders=[
            "48-hps",
            "48-jigs",
            "48-marches",
            "48-misc",
            "48-polkas",
            "48-reels",
            "48-sjigs",
        ],
    ),
    Collection(
        key="moi",
        title="Music of Ireland",
        folder="49",
        subfolders=[
            "491-airs",
            "492-hps",
            "493-jigs",
            "494-misc",
            "495-reels",
            "496-sjigs",
            "497-arr",
        ],
    ),
    Collection(
        key="roche",
        title="Roche Collection",
        folder="53",
        subfolders=["53-hps", "53-jigs", "53-misc", "53-polkas", "53-reels", "53-sjigs"],
    ),
]

_KEY_TO_COLLECTION = {c.key: c for c in COLLECTIONS}


def get_collection(key: str) -> Collection:
    """Get a collection by key.

    Raises
    ------
    ValueError
        If `key` is not the key of one of the :data:`COLLECTIONS`.
    """
    try:
        return _KEY_TO_COLLECTION[key]
    except KeyError as e:
        raise ValueError(
            f"Unknown collection key: {key!r}. "
            f"Valid keys are: {sorted(_KEY_TO_COLLECTION)}."
        ) from e


def download(key: str | Iterable[str] | None = None) -> None:
    """Download the RTF files of one or more collections and store them gzipped.

    Parameters
    ----------
    key
        Collection key or keys. By default, all collections are downloaded.

    Raises
    ------
    ValueError
        If a key is not a known collection key.
    requests.HTTPError
        If the server responds with an error status for any of the files.
    """
    import gzip

    import requests

    SAVE_TO.mkdir(exist_ok=True)

    if key is None:  # pragma: no cover
        collections = COLLECTIONS
    elif isinstance(key, str):
        collections = [get_collection(key)]
    else:  # pragma: no cover
        collections = [get_collection(k) for k in key]

    for collection in collections:
        for url in collection.abc_urls:
            p = collection.url_to_file(url)
            logger.info(f"Downloading {url} to {p.relative_to(HERE).as_posix()}")
            r = requests.get(url, headers={"User-Agent": "pyabc2"}, timeout=5)
            r.raise_for_status()

            # Store the raw response bytes, gzip-compressed
            with gzip.open(p, "wb") as f:
                f.write(r.content)


def load_meta(key: str, *, redownload: bool = False, debug: bool = False) -> list[str]:
    """Load the tunebook data, no parsing.

    Parameters
    ----------
    key
        Collection key (see :data:`COLLECTIONS`).
    redownload
        Re-download the data file.
    debug
        Show debug messages.

    Returns
    -------
    list of str
        The tune blocks, each starting with ``X:`` and ending with its final bar symbol.

    Raises
    ------
    ValueError
        If `key` is not a known collection key.
    RuntimeError
        If a file contains no ``X:``, or a tune block contains no bar symbol.
    """
    import gzip
    import re

    if debug:  # pragma: no cover
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.NOTSET)

    collection = get_collection(key)
    if redownload or any(not p.is_file() for p in collection.files):
        print("downloading...", end=" ", flush=True)
        download(key=collection.key)
        print("done")

    abcs = []
    for p in collection.files:
        logger.debug(f"Loading {p.name}")
        # NOTE(review): no explicit encoding, so the platform default is used
        # -- confirm whether the RTF files can contain non-ASCII bytes
        with gzip.open(p, "rt") as f:
            text = f.read()

        # Replace \\\n with just \n
        text = text.replace("\\\n", "\n")

        # Continuation
        text = text.replace("\\\\", "\\")

        # A tune block starts with the X: line and ends with a %%% line
        # or the end of the file.

        # Find the start of the first tune, in order to skip header info
        start = text.find("X:")
        if start == -1:  # pragma: no cover
            raise RuntimeError(f"Could not find start of tune in {p.name}")

        # Split on 3 or more %
        blocks = re.split(r"\s*%{3,}\s*", text[start:])

        for i, block in enumerate(blocks):
            if not block.strip():
                logger.debug(f"Empty block {i} in {p.name}")
                continue

            if re.fullmatch(r"[0-9]+ deleted", block) is not None:
                logger.debug(f"Tune in block {i} in {p.name} marked as deleted: {block!r}")
                continue

            if not block.startswith("X:"):
                logger.debug(f"Block {i} in {p.name} does not start with `X:`: {block!r}")
                continue

            # Remove anything that may be after the final bar symbol
            j = max(block.rfind("]"), block.rfind("|"))
            if j == -1:  # pragma: no cover
                # Explicit check instead of `assert`, which is skipped under `python -O`
                raise RuntimeError(f"No bar symbol found in block {i} in {p.name}: {block!r}")
            if j < len(block) - 1:
                logger.info(
                    f"Block {i} in {p.name} has trailing data after the final bar symbol "
                    f"that will be ignored: {block[j+1:]!r}"
                )
            abcs.append(block[: j + 1])

    return abcs


if __name__ == "__main__":  # pragma: no cover
    abcs = load_meta("aif", debug=True)
    print()
    print(abcs[0])
def test_bill_black_no_https():
    # The site is expected to be HTTP-only; if it does get HTTPS, we'd like to know
    import requests

    headers = {"User-Agent": "pyabc2"}
    http_url = "http://www.capeirish.com/ittl/tunefolders/"
    https_url = http_url.replace("http://", "https://")

    requests.head(http_url, headers=headers, timeout=5).raise_for_status()

    with pytest.raises(requests.exceptions.SSLError):
        requests.head(https_url, headers=headers, timeout=5).raise_for_status()


@pytest.mark.parametrize("key", list(bill_black_tunefolders._KEY_TO_COLLECTION))
def test_bill_black_tunefolders(key):
    import requests

    # Folders whose RTF files can no longer be fetched:
    # 14, 18, 25 -- These only have .txt now, not .rtf
    # 21 -- some subfolder names don't match the file names
    # 49 -- has subsubfolders
    unavailable = {14, 18, 21, 25, 49}

    col = bill_black_tunefolders.get_collection(key)
    if int(col.folder) not in unavailable:
        assert len(bill_black_tunefolders.load_meta(key)) > 0
        return

    with pytest.raises(requests.exceptions.HTTPError) as e:
        bill_black_tunefolders.load_meta(key)
    assert e.value.response.status_code == 404


def test_bill_black_tunefolders_invalid_key():
    with pytest.raises(ValueError, match="Unknown collection key: 'asdf'"):
        bill_black_tunefolders.get_collection("asdf")


def test_bill_black_text_fns():
    # The file names listed on the site should match the ones we download
    import requests

    response = requests.get(
        "http://www.capeirish.com/ittl/alltunes/text/",
        headers={"User-Agent": "pyabc2"},
        timeout=5,
    )
    response.raise_for_status()

    fns_web = re.findall(r'href=["\']([a-z0-9\-]+\.txt)["\']', response.text)
    fns_web.sort()
    try:
        # We're using s-tunes-2, not both
        fns_web.remove("s-tunes-1.txt")
    except ValueError:
        pass

    assert bill_black.TXT_FNS == fns_web


def test_bill_black_load():
    tunes = bill_black.load_meta()
    assert len(tunes) > 0
    assert tunes[0].startswith("X:")