Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,35 @@ name: Python package

on:
push:
branches: [ master ]
branches: [master]
pull_request:
branches: [ master ]
branches: [master]

jobs:
build:

runs-on: ubuntu-latest
runs-on: ubuntu-24.04
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: ["3.10", "3.11", "3.12", "3.13"]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
6 changes: 3 additions & 3 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: '3.x'
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
7 changes: 3 additions & 4 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

A Python project that downloads words from the English Wiktionary ([en.wiktionary.org](https://en.wiktionary.org)) and parses article content into an easy-to-use JSON format. Right now, it parses etymologies, definitions, pronunciations, examples, audio links and related words.

Note: This project will not be maintained, since there are many free dictionary APIs now; please see https://dictionaryapi.dev/, for example.
There are many free dictionary APIs nowadays which may or may not make this project redundant for you; do check out https://dictionaryapi.dev, for example.

[![Downloads](http://pepy.tech/badge/wiktionaryparser)](http://pepy.tech/project/wiktionaryparser)

Expand All @@ -29,7 +29,7 @@ Note: This project will not be maintained since there are many free dictionary A

#### Installation

##### Using pip
##### Using pip
* run `pip install wiktionaryparser`

##### From Source
Expand Down Expand Up @@ -59,8 +59,7 @@ Note: This project will not be maintained since there are many free dictionary A

#### Requirements

- requests==2.20.0
- beautifulsoup4==4.4.0
Python 3.10+

#### Contributions

Expand Down
14 changes: 7 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
requests==2.20.0
beautifulsoup4==4.9.1
deepdiff==5.0.2
parameterized==0.7.4
requests-futures==1.0.0
mock==4.0.2
pylint==2.6.0
requests
beautifulsoup4
deepdiff
parameterized
requests-futures
mock
pylint
14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from setuptools import setup,find_packages
from setuptools import setup

with open('readme.md', 'r') as readme:
long_desc = readme.read()
Expand All @@ -13,12 +13,12 @@
data_files=[('testOutput', ['tests/testOutput.json']), ('readme', ['readme.md']), ('requirements', ['requirements.txt'])],
author = 'Suyash Behera',
author_email = 'sne9x@outlook.com',
url = 'https://github.com/Suyash458/WiktionaryParser',
download_url = 'https://github.com/Suyash458/WiktionaryParser/archive/master.zip',
url = 'https://github.com/Suyash458/WiktionaryParser',
download_url = 'https://github.com/Suyash458/WiktionaryParser/archive/master.zip',
keywords = ['Parser', 'Wiktionary'],
install_requires = ['beautifulsoup4','requests'],
install_requires = ['beautifulsoup4', 'requests'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: MIT License',
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: MIT License',
],
)
)
56 changes: 29 additions & 27 deletions wiktionaryparser/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import re, requests
import logging
import re
import requests
from wiktionaryparser.utils import WordData, Definition, RelatedWord
from bs4 import BeautifulSoup
from itertools import zip_longest
Expand All @@ -20,7 +22,7 @@
"coordinate terms",
]

def is_subheading(child, parent):
def is_subheading(child: str, parent: str) -> bool:
child_headings = child.split(".")
parent_headings = parent.split(".")
if len(child_headings) <= len(parent_headings):
Expand All @@ -30,60 +32,60 @@ def is_subheading(child, parent):
return False
return True

class WiktionaryParser(object):
def __init__(self):
class WiktionaryParser:
def __init__(self) -> None:
self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
self.soup = None
self.session = requests.Session()
self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2))
self.language = 'english'
self.language: str = 'english'
self.current_word = None
self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH)
self.RELATIONS = copy(RELATIONS)
self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']
self.PARTS_OF_SPEECH: list[str] = copy(PARTS_OF_SPEECH)
self.RELATIONS: list[str] = copy(RELATIONS)
self.INCLUDED_ITEMS: list[str] = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']

def include_part_of_speech(self, part_of_speech):
def include_part_of_speech(self, part_of_speech) -> None:
part_of_speech = part_of_speech.lower()
if part_of_speech not in self.PARTS_OF_SPEECH:
self.PARTS_OF_SPEECH.append(part_of_speech)
self.INCLUDED_ITEMS.append(part_of_speech)

def exclude_part_of_speech(self, part_of_speech):
def exclude_part_of_speech(self, part_of_speech) -> None:
part_of_speech = part_of_speech.lower()
self.PARTS_OF_SPEECH.remove(part_of_speech)
self.INCLUDED_ITEMS.remove(part_of_speech)

def include_relation(self, relation):
def include_relation(self, relation: str) -> None:
relation = relation.lower()
if relation not in self.RELATIONS:
self.RELATIONS.append(relation)
self.INCLUDED_ITEMS.append(relation)

def exclude_relation(self, relation):
def exclude_relation(self, relation) -> None:
relation = relation.lower()
self.RELATIONS.remove(relation)
self.INCLUDED_ITEMS.remove(relation)

def set_default_language(self, language=None):
def set_default_language(self, language=None) -> None:
if language is not None:
self.language = language.lower()

def get_default_language(self):
def get_default_language(self) -> str:
return self.language

def clean_html(self):
def clean_html(self) -> None:
unwanted_classes = ['sister-wikipedia', 'thumb', 'reference', 'cited-source']
for tag in self.soup.find_all(True, {'class': unwanted_classes}):
tag.extract()

def remove_digits(self, string):
def remove_digits(self, string: str) -> str:
return string.translate(str.maketrans('', '', digits)).strip()

def count_digits(self, string):
def count_digits(self, string: str) -> int:
return len(list(filter(str.isdigit, string)))

def get_id_list(self, contents, content_type):
def get_id_list(self, contents: list, content_type: str) -> list[tuple[str, str, str]]:
if content_type == 'etymologies':
checklist = ['etymology']
elif content_type == 'pronunciation':
Expand All @@ -96,7 +98,7 @@ def get_id_list(self, contents, content_type):
checklist = self.RELATIONS
else:
return None
id_list = []
id_list: list[tuple[str, str, str]] = []
if len(contents) == 0:
return [('1', x.title(), x) for x in checklist if self.soup.find('span', {'id': x.title()})]
for content_tag in contents:
Expand All @@ -107,7 +109,7 @@ def get_id_list(self, contents, content_type):
id_list.append((content_index, content_id, text_to_check))
return id_list

def get_word_data(self, language):
def get_word_data(self, language: str) -> list:
contents = self.soup.find_all('span', {'class': 'toctext'})
word_contents = []
start_index = None
Expand Down Expand Up @@ -139,7 +141,7 @@ def get_word_data(self, language):
json_obj_list = self.map_to_object(word_data)
return json_obj_list

def parse_pronunciations(self, word_contents):
def parse_pronunciations(self, word_contents) -> list:
pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
pronunciation_list = []
audio_links = []
Expand Down Expand Up @@ -168,7 +170,7 @@ def parse_pronunciations(self, word_contents):
pronunciation_list.append((pronunciation_index, pronunciation_text, audio_links))
return pronunciation_list

def parse_definitions(self, word_contents):
def parse_definitions(self, word_contents) -> list:
definition_id_list = self.get_id_list(word_contents, 'definitions')
definition_list = []
definition_tag = None
Expand All @@ -191,7 +193,7 @@ def parse_definitions(self, word_contents):
definition_list.append((def_index, definition_text, def_type))
return definition_list

def parse_examples(self, word_contents):
def parse_examples(self, word_contents) -> list:
definition_id_list = self.get_id_list(word_contents, 'definitions')
example_list = []
for def_index, def_id, def_type in definition_id_list:
Expand All @@ -212,7 +214,7 @@ def parse_examples(self, word_contents):
table = table.find_next_sibling()
return example_list

def parse_etymologies(self, word_contents):
def parse_etymologies(self, word_contents) -> list:
etymology_id_list = self.get_id_list(word_contents, 'etymologies')
etymology_list = []
etymology_tag = None
Expand All @@ -231,7 +233,7 @@ def parse_etymologies(self, word_contents):
etymology_list.append((etymology_index, etymology_text))
return etymology_list

def parse_related_words(self, word_contents):
def parse_related_words(self, word_contents) -> list:
relation_id_list = self.get_id_list(word_contents, 'related')
related_words_list = []
for related_index, related_id, relation_type in relation_id_list:
Expand All @@ -246,7 +248,7 @@ def parse_related_words(self, word_contents):
related_words_list.append((related_index, words, relation_type))
return related_words_list

def map_to_object(self, word_data):
def map_to_object(self, word_data: dict) -> list:
json_obj_list = []
if not word_data['etymologies']:
word_data['etymologies'] = [('', '')]
Expand Down Expand Up @@ -276,7 +278,7 @@ def map_to_object(self, word_data):
json_obj_list.append(data_obj.to_json())
return json_obj_list

def fetch(self, word, language=None, old_id=None):
def fetch(self, word: str, language: str | None = None, old_id: int | None = None) -> list:
language = self.language if not language else language
response = self.session.get(self.url.format(word), params={'oldid': old_id})
self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
Expand Down
Empty file added wiktionaryparser/py.typed
Empty file.