diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 661fb3f..0e1335f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,35 +5,35 @@ name: Python package on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4e1ef42..1b7b1c5 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -13,11 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.x' + python-version: '3.13' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/readme.md b/readme.md index 35ecb9d..0cae407 100644 --- a/readme.md +++ b/readme.md @@ -2,7 +2,7 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.org](https://en.wiktionary.org)) and parses articles' content in an easy to use JSON format. Right now, it parses etymologies, definitions, pronunciations, examples, audio links and related words. -Note: This project will not be maintained since there are many free dictionary APIs now, please see - https://dictionaryapi.dev/ for example +There are many free dictionary APIs nowadays which may or may not make this project redundant for you, do check out https://dictionaryapi.dev, for example. [![Downloads](http://pepy.tech/badge/wiktionaryparser)](http://pepy.tech/project/wiktionaryparser) @@ -29,7 +29,7 @@ Note: This project will not be maintained since there are many free dictionary A #### Installation -##### Using pip +##### Using pip * run `pip install wiktionaryparser` ##### From Source @@ -59,8 +59,7 @@ Note: This project will not be maintained since there are many free dictionary A #### Requirements - - requests==2.20.0 - - beautifulsoup4==4.4.0 +Python 3.10+ #### Contributions diff --git a/requirements.txt b/requirements.txt index 14bdefe..c8057ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -requests==2.20.0 -beautifulsoup4==4.9.1 -deepdiff==5.0.2 -parameterized==0.7.4 -requests-futures==1.0.0 -mock==4.0.2 -pylint==2.6.0 \ No newline at end of file +requests +beautifulsoup4 +deepdiff +parameterized +requests-futures +mock +pylint diff --git a/setup.py b/setup.py index 1a03648..9c16869 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup,find_packages +from setuptools import setup with open('readme.md', 'r') as readme: long_desc = readme.read() @@ -13,12 +13,12 @@ data_files=[('testOutput', ['tests/testOutput.json']), ('readme', ['readme.md']), ('requirements', ['requirements.txt'])], author = 'Suyash Behera', author_email = 'sne9x@outlook.com', - url = 'https://github.com/Suyash458/WiktionaryParser', - download_url = 'https://github.com/Suyash458/WiktionaryParser/archive/master.zip', + url = 'https://github.com/Suyash458/WiktionaryParser', + download_url = 'https://github.com/Suyash458/WiktionaryParser/archive/master.zip', keywords = ['Parser', 'Wiktionary'], - install_requires = ['beautifulsoup4','requests'], + install_requires = ['beautifulsoup4', 'requests'], classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: MIT License', + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: MIT License', ], -) \ No newline at end of file +) diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 49f6617..d298f33 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -1,4 +1,6 @@ -import re, requests +import logging +import re +import requests from wiktionaryparser.utils import WordData, Definition, RelatedWord from bs4 import BeautifulSoup from itertools import zip_longest @@ -20,7 +22,7 @@ "coordinate terms", ] -def is_subheading(child, parent): +def is_subheading(child: str, parent: str) -> bool: child_headings = child.split(".") parent_headings = parent.split(".") if len(child_headings) <= len(parent_headings): @@ -30,60 +32,60 @@ def is_subheading(child, parent): return False return True -class WiktionaryParser(object): - def __init__(self): +class WiktionaryParser: + def __init__(self) -> None: self.url = "https://en.wiktionary.org/wiki/{}?printable=yes" self.soup = None self.session = requests.Session() self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2)) self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2)) - self.language = 'english' + self.language: str = 'english' self.current_word = None - self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH) - self.RELATIONS = copy(RELATIONS) - self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation'] + self.PARTS_OF_SPEECH: list[str] = copy(PARTS_OF_SPEECH) + self.RELATIONS: list[str] = copy(RELATIONS) + self.INCLUDED_ITEMS: list[str] = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation'] - def include_part_of_speech(self, part_of_speech): + def include_part_of_speech(self, part_of_speech) -> None: part_of_speech = part_of_speech.lower() if part_of_speech not in self.PARTS_OF_SPEECH: self.PARTS_OF_SPEECH.append(part_of_speech) self.INCLUDED_ITEMS.append(part_of_speech) - def exclude_part_of_speech(self, part_of_speech): + def exclude_part_of_speech(self, part_of_speech) -> None: part_of_speech = part_of_speech.lower() self.PARTS_OF_SPEECH.remove(part_of_speech) self.INCLUDED_ITEMS.remove(part_of_speech) - def include_relation(self, relation): + def include_relation(self, relation: str) -> None: relation = relation.lower() if relation not in self.RELATIONS: self.RELATIONS.append(relation) self.INCLUDED_ITEMS.append(relation) - def exclude_relation(self, relation): + def exclude_relation(self, relation) -> None: relation = relation.lower() self.RELATIONS.remove(relation) self.INCLUDED_ITEMS.remove(relation) - def set_default_language(self, language=None): + def set_default_language(self, language=None) -> None: if language is not None: self.language = language.lower() - def get_default_language(self): + def get_default_language(self) -> str: return self.language - def clean_html(self): + def clean_html(self) -> None: unwanted_classes = ['sister-wikipedia', 'thumb', 'reference', 'cited-source'] for tag in self.soup.find_all(True, {'class': unwanted_classes}): tag.extract() - def remove_digits(self, string): + def remove_digits(self, string: str) -> str: return string.translate(str.maketrans('', '', digits)).strip() - def count_digits(self, string): + def count_digits(self, string: str) -> int: return len(list(filter(str.isdigit, string))) - def get_id_list(self, contents, content_type): + def get_id_list(self, contents: list, content_type: str) -> list[tuple[str, str, str]]: if content_type == 'etymologies': checklist = ['etymology'] elif content_type == 'pronunciation': @@ -96,7 +98,7 @@ def get_id_list(self, contents, content_type): checklist = self.RELATIONS else: return None - id_list = [] + id_list: list[tuple[str, str, str]] = [] if len(contents) == 0: return [('1', x.title(), x) for x in checklist if self.soup.find('span', {'id': x.title()})] for content_tag in contents: @@ -107,7 +109,7 @@ def get_id_list(self, contents, content_type): id_list.append((content_index, content_id, text_to_check)) return id_list - def get_word_data(self, language): + def get_word_data(self, language: str) -> list: contents = self.soup.find_all('span', {'class': 'toctext'}) word_contents = [] start_index = None @@ -139,7 +141,7 @@ def get_word_data(self, language): json_obj_list = self.map_to_object(word_data) return json_obj_list - def parse_pronunciations(self, word_contents): + def parse_pronunciations(self, word_contents) -> list: pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation') pronunciation_list = [] audio_links = [] @@ -168,7 +170,7 @@ def parse_pronunciations(self, word_contents): pronunciation_list.append((pronunciation_index, pronunciation_text, audio_links)) return pronunciation_list - def parse_definitions(self, word_contents): + def parse_definitions(self, word_contents) -> list: definition_id_list = self.get_id_list(word_contents, 'definitions') definition_list = [] definition_tag = None @@ -191,7 +193,7 @@ def parse_definitions(self, word_contents): definition_list.append((def_index, definition_text, def_type)) return definition_list - def parse_examples(self, word_contents): + def parse_examples(self, word_contents) -> list: definition_id_list = self.get_id_list(word_contents, 'definitions') example_list = [] for def_index, def_id, def_type in definition_id_list: @@ -212,7 +214,7 @@ def parse_examples(self, word_contents): table = table.find_next_sibling() return example_list - def parse_etymologies(self, word_contents): + def parse_etymologies(self, word_contents) -> list: etymology_id_list = self.get_id_list(word_contents, 'etymologies') etymology_list = [] etymology_tag = None @@ -231,7 +233,7 @@ def parse_etymologies(self, word_contents): etymology_list.append((etymology_index, etymology_text)) return etymology_list - def parse_related_words(self, word_contents): + def parse_related_words(self, word_contents) -> list: relation_id_list = self.get_id_list(word_contents, 'related') related_words_list = [] for related_index, related_id, relation_type in relation_id_list: @@ -246,7 +248,7 @@ def parse_related_words(self, word_contents): related_words_list.append((related_index, words, relation_type)) return related_words_list - def map_to_object(self, word_data): + def map_to_object(self, word_data: dict) -> list: json_obj_list = [] if not word_data['etymologies']: word_data['etymologies'] = [('', '')] @@ -276,7 +278,7 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return json_obj_list - def fetch(self, word, language=None, old_id=None): + def fetch(self, word: str, language: str | None = None, old_id: int | None = None) -> list: language = self.language if not language else language response = self.session.get(self.url.format(word), params={'oldid': old_id}) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') diff --git a/wiktionaryparser/py.typed b/wiktionaryparser/py.typed new file mode 100644 index 0000000..e69de29