From 876f43b79266c16ee32ae28fb1e13b624c238dd5 Mon Sep 17 00:00:00 2001
From: alipheesa
Date: Sun, 17 Dec 2023 15:04:49 +0300
Subject: [PATCH] feat: Added norwegian and austrian parsers

---
 unrefactored/parser.py                        |  11 +
 unrefactored/requirements.txt                 |  26 +++
 unrefactored/src/generics.py                  | 216 ++++++++++++++++++
 unrefactored/src/handlers/AustrianHandler.py  |  61 +++++
 unrefactored/src/handlers/NorwegianHandler.py |  27 +++
 unrefactored/templates/template.scs           |  19 ++
 unrefactored/utils/repair_xlsx.py             |   9 +
 unrefactored/utils/translate_text.py          |   7 +
 8 files changed, 376 insertions(+)
 create mode 100644 unrefactored/parser.py
 create mode 100644 unrefactored/requirements.txt
 create mode 100644 unrefactored/src/generics.py
 create mode 100644 unrefactored/src/handlers/AustrianHandler.py
 create mode 100644 unrefactored/src/handlers/NorwegianHandler.py
 create mode 100644 unrefactored/templates/template.scs
 create mode 100644 unrefactored/utils/repair_xlsx.py
 create mode 100644 unrefactored/utils/translate_text.py

diff --git a/unrefactored/parser.py b/unrefactored/parser.py
new file mode 100644
index 0000000..64e16f3
--- /dev/null
+++ b/unrefactored/parser.py
@@ -0,0 +1,11 @@
+import os
+from src.handlers.AustrianHandler import AustrianHandler
+from src.handlers.NorwegianHandler import NorwegianHandler
+
+if __name__ == '__main__':
+    handler = AustrianHandler()
+    # # handler.run()
+    os.makedirs(handler.OUTPUT_DIR, exist_ok=True)
+    for name, payload in handler.render_scs():
+        with open(f'{handler.OUTPUT_DIR}{name}.scs', 'w', encoding="utf-8") as file:
+            file.write(payload)
diff --git a/unrefactored/requirements.txt b/unrefactored/requirements.txt
new file mode 100644
index 0000000..be5bbba
--- /dev/null
+++ b/unrefactored/requirements.txt
@@ -0,0 +1,26 @@
+aspose-cells-python==23.11.0
+attrs==23.1.0
+certifi==2023.11.17
+cffi==1.16.0
+et-xmlfile==1.1.0
+h11==0.14.0
+idna==3.4
+Jinja2==3.1.2
+MarkupSafe==2.1.3
+numpy==1.26.2
+openpyxl==3.1.2
+outcome==1.3.0.post0
+pandas==2.1.3
+pycparser==2.21
+PySocks==1.7.1
+python-dateutil==2.8.2
+pytz==2023.3.post1
+selenium==4.15.2
+six==1.16.0
+sniffio==1.3.0
+sortedcontainers==2.4.0
+trio==0.23.1
+trio-websocket==0.11.1
+tzdata==2023.3
+urllib3==2.1.0
+wsproto==1.2.0
diff --git a/unrefactored/src/generics.py b/unrefactored/src/generics.py
new file mode 100644
index 0000000..bb96354
--- /dev/null
+++ b/unrefactored/src/generics.py
@@ -0,0 +1,216 @@
+import os
+import time
+from string import punctuation
+from abc import ABCMeta, abstractmethod
+
+import pandas as pd
+from selenium import webdriver
+from jinja2 import Environment, FileSystemLoader
+from utils.repair_xlsx import repair_xlsx
+from utils.translate_text import translate
+
+
+class AbstractHandler(metaclass=ABCMeta):
+
+    URL = None
+    OUTPUT_DIR = None
+
+    @abstractmethod
+    def generate(self) -> dict:
+        """
+        A generator that yields processed dict entries ready to be inserted into the template.
+        """
+        pass
+
+    @abstractmethod
+    def render_scs(self) -> (str, str):
+        """
+        A generator that yields (name, rendered text) pairs; uses the 'generate' method to retrieve data.
+        """
+        pass
+
+    @abstractmethod
+    def run(self):
+        """
+        Runs the whole download-parse-render-save pipeline.
+        """
+        pass
+
+
+class AbstractStreamHandler(AbstractHandler):
+
+    @abstractmethod
+    def generate_raw(self) -> dict:
+        """
+        A generator responsible for parsing and streaming data from the target site.
+        """
+        pass
+
+
+class AbstractXLSXHandler(AbstractHandler):
+
+    RAW_DATA_DIR = None
+
+    @abstractmethod
+    def download_xlsx(self) -> None:
+        """
+        Downloads the xlsx file from the target site and saves it in the filesystem.
+        """
+        pass
+
+    def run(self):
+        self.download_xlsx()
+        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
+        for name, payload in self.render_scs():
+            if name is not None:
+                with open(f'{self.OUTPUT_DIR}{name}.scs', 'w', encoding="utf-8") as file:
+                    file.write(payload)
+
+
+class StandartXLSXHandler(AbstractXLSXHandler):
+
+    INITIAL_WAIT_SECONDS = 0
+    TEXT_LINK_SEQUENCE = None
+    COUNTRY_CODE = None
+    FILENAME_RAW = None
+    XLSX_MAPPING = None
+
+    def __init__(self):
+        super().__init__()
+
+        null_fields = []
+        for field in ['TEXT_LINK_SEQUENCE', 'URL', 'COUNTRY_CODE', 'FILENAME_RAW', 'OUTPUT_DIR', 'RAW_DATA_DIR']:
+            if getattr(self, field) is None:
+                null_fields.append(field)
+
+        if len(null_fields) > 0:
+            raise Exception(f"Following fields can't be None: {', '.join(null_fields)}")
+
+    def download_xlsx(self) -> None:
+        """
+        Drives Chrome through TEXT_LINK_SEQUENCE on URL, then renames and repairs the newest file in RAW_DATA_DIR.
+        """
+        options = webdriver.ChromeOptions()
+        os.makedirs(self.RAW_DATA_DIR, exist_ok=True)
+        prefs = {"download.default_directory": self.RAW_DATA_DIR}
+        options.add_experimental_option("prefs", prefs)
+
+        driver = webdriver.Chrome(options=options)
+        driver.maximize_window()
+
+        try:
+            # 1. Load page.
+            driver.get(self.URL)
+            time.sleep(self.INITIAL_WAIT_SECONDS)
+
+            # 2. Navigate to the download button; each step is (locator strategy, locator value, seconds to wait).
+            for by, text, sleep in self.TEXT_LINK_SEQUENCE:
+                element = driver.find_element(
+                    by,
+                    text
+                )
+                element.click()
+                time.sleep(sleep)
+
+            # 3. Pick the most recently created file in the download directory.
+            candidates = [os.path.join(self.RAW_DATA_DIR, f) for f in os.listdir(self.RAW_DATA_DIR)]
+            latest = max(candidates, key=os.path.getctime)
+            target = os.path.join(self.RAW_DATA_DIR, self.FILENAME_RAW)
+
+            # 4. Rename file; os.replace also overwrites a copy left by a previous run.
+            os.replace(latest, target)
+
+            # 5. Repair xlsx file.
+            repair_xlsx(target, target)
+
+        except Exception as err:
+            print(err)
+        finally:
+            # Always close the browser, even when one of the steps above failed.
+            driver.quit()
+
+    def generate(self) -> dict:
+        """
+        A generator that yields one dict per spreadsheet row that passes the filters below, for .scs templates.
+        """
+        raw_df = pd.read_excel(os.path.join(self.RAW_DATA_DIR, self.FILENAME_RAW))
+        raw_df = raw_df[list(self.XLSX_MAPPING)]
+        raw_df = raw_df.dropna()
+        raw_df = raw_df.rename(columns=self.XLSX_MAPPING)
+        raw_df = raw_df.astype('str')
+        if 'dose_unit' in raw_df.columns:
+            raw_df = raw_df[~raw_df['dose_unit'].str.contains('%', regex=False)]
+
+        # # Run script with "export PYTHONIOENCODING=UTF-8" or uncomment this:
+        # # ------------------------------------------------------------------
+        # for name in raw_df.columns:
+        #     raw_df[name] = raw_df[name].apply(lambda x: x.encode('unicode_escape').decode())
+
+        if 'dose_value' in raw_df.columns:
+            raw_df['dose_value'] = raw_df['dose_value'].apply(lambda x: x.replace(';', '/'))
+        if 'dose_unit' in raw_df.columns:
+            raw_df['dose_unit'] = raw_df['dose_unit'].apply(lambda x: x.replace(';', '/'))
+
+        for idx, row in raw_df.iterrows():
+            out = row.to_dict()
+            for c in punctuation.replace('_', '').replace('%', '') + '„“–':
+                out['name'] = out['name'].replace(c, '')
+            out['name'] = "_".join(out['name'].split(' '))
+            if 'form' in raw_df.columns:
+                out['form'] = list(map(str.strip, out['form'].split(',')))
+            if 'dose_value' in raw_df.columns:
+                out['dose_value'] = out['dose_value'].replace(',', '_').replace('.', '_')
+            if 'dose_value' in raw_df.columns:
+                out['dose_value'] = list(map(str.strip, out['dose_value'].split('/')))
+            if 'dose_unit' in raw_df.columns:
+                out['dose_unit'] = list(map(str.strip, out['dose_unit'].split('/')))
+            if 'active_substance' in raw_df.columns:
+                out['active_substance'] = list(map(str.strip, out['active_substance'].split(';')))
+
+            if all([x in out.keys() for x in ['dose_value', 'dose_unit', 'form']]):
+
+                difference = len(out['dose_value']) - len(out['dose_unit'])
+
+                if difference == 1:
+                    out['dose_unit'] = out['dose_unit'] + out['dose_unit']
+                elif difference == -1:
+                    out['dose_value'] = out['dose_value'] + out['dose_value']
+
+                if len(out['dose_value']) > 2 or len(out['dose_unit']) > 2 or abs(difference) > 1:
+                    continue
+
+                if not len(out['dose_value']) == len(out['dose_unit']) == len(out['form']):
+                    continue
+
+            yield out
+
+    def render_scs(self) -> (str, str):
+        """
+        Generator responsible for yielding tuples consisting of filenames and rendered jinja2 templates.
+        """
+
+        jinja_env = Environment(loader=FileSystemLoader('./templates'))
+        template = jinja_env.get_template('template.scs')
+
+        for data in self.generate():
+
+            for key in data.keys():
+                if isinstance(data[key], str):
+                    data[key] = data[key].strip()
+
+            data['labels'] = [
+                (self.COUNTRY_CODE, data['name']),
+                ('en', translate(data['name'])),
+            ]
+
+            data['name'] = data['name'].lower()
+
+            if 'dose_value' in data.keys() and 'dose_unit' in data.keys():
+                for dvalue, dunit in zip(data['dose_value'], data['dose_unit']):
+                    identifier = f'medication_{data["name"]}_{dvalue}{dunit}'
+                    data['identifier'] = identifier
+                    yield identifier, template.render(**data)
+            else:
+                identifier = f'medication_{data["name"]}'
+                data['identifier'] = identifier
+                yield identifier, template.render(**data)
diff --git a/unrefactored/src/handlers/AustrianHandler.py b/unrefactored/src/handlers/AustrianHandler.py
new file mode 100644
index 0000000..c4bbbcb
--- /dev/null
+++ b/unrefactored/src/handlers/AustrianHandler.py
@@ -0,0 +1,61 @@
+import os
+import sys
+
+from selenium.webdriver.common.by import By
+import pandas as pd
+from string import punctuation
+
+from src.generics import StandartXLSXHandler
+
+
+class AustrianHandler(StandartXLSXHandler):
+
+    INITIAL_WAIT_SECONDS = 8
+    TEXT_LINK_SEQUENCE = [
+        (By.XPATH, "//button[contains(text(),'Suchen')]", 1),
+        (By.XPATH, "//img[@title='Trefferliste als .xls herunterladen']", 45)
+    ]
+    URL = 'https://aspregister.basg.gv.at/aspregister/faces/aspregister.jspx'
+    # NOTE(review): ISO 3166-1 assigns 'au' to Australia and 'at' to Austria - confirm before renaming.
+    COUNTRY_CODE = 'au'
+    FILENAME_RAW = f'data_{COUNTRY_CODE}.xlsx'
+    OUTPUT_DIR = f'output/{COUNTRY_CODE}/'
+    RAW_DATA_DIR = os.path.join(sys.path[0], 'data')
+    XLSX_MAPPING = {
+        'Name': 'name',
+        'ATC Code': 'code',
+        'Wirkstoff': 'active_substance',
+        'Inhaber/-in ': 'manufacturer'
+    }
+
+    def generate(self) -> dict:
+        """
+        A generator that yields one dict per row whose name contains a dose-unit marker; rows without one are skipped.
+        """
+        raw_df = pd.read_excel(os.path.join(self.RAW_DATA_DIR, self.FILENAME_RAW))
+        raw_df = raw_df[list(self.XLSX_MAPPING)]
+        raw_df = raw_df.dropna()
+        raw_df = raw_df.rename(columns=self.XLSX_MAPPING)
+        raw_df = raw_df.astype('str')
+
+        for idx, row in raw_df.iterrows():
+            out = row.to_dict()
+            for c in punctuation.replace('_', '').replace('%', '') + '„“–':
+                out['name'] = out['name'].replace(c, '')
+            out['name'] = "_".join(out['name'].split(' '))
+
+            option_list = [out['name'].partition(x) for x in
+                           ['_mg_', '_mg/', '_g_', '_g/', '_E_', '_E/', '_I_', '_I/', '_mmol_', '_mmol/', '_%']]
+            name, dose_unit, _ = min(option_list, key=lambda x: len(x[0]))
+            name = name.strip()
+            if len(dose_unit) == 0:
+                continue
+
+            name, dose_value = "_".join(name.split('_')[:-1]), name.split('_')[-1]
+
+            out['name'] = name.strip('_')
+            out['dose_value'] = [dose_value.strip('_')]
+            out['dose_unit'] = [dose_unit.strip('_')]
+            out['active_substance'] = out['active_substance'].split()
+
+            yield out
diff --git a/unrefactored/src/handlers/NorwegianHandler.py b/unrefactored/src/handlers/NorwegianHandler.py
new file mode 100644
index 0000000..9fc802c
--- /dev/null
+++ b/unrefactored/src/handlers/NorwegianHandler.py
@@ -0,0 +1,27 @@
+import os
+import sys
+
+from selenium.webdriver.common.by import By
+from src.generics import StandartXLSXHandler
+
+
+class NorwegianHandler(StandartXLSXHandler):
+
+    TEXT_LINK_SEQUENCE = [
+        (By.PARTIAL_LINK_TEXT, 'Pakninger', 6),
+        (By.PARTIAL_LINK_TEXT, 'Eksporter resultater', 6)
+    ]
+    URL = 'https://www.legemiddelsok.no/'
+    COUNTRY_CODE = 'no'
+    FILENAME_RAW = f'data_{COUNTRY_CODE}.xlsx'
+    OUTPUT_DIR = f'output/{COUNTRY_CODE}/'
+    RAW_DATA_DIR = os.path.join(sys.path[0], 'data')
+    XLSX_MAPPING = {
+        'Handelsnavn': 'name',
+        'Form': 'form',
+        'Styrke tallverdi': 'dose_value',
+        'Styrke enhet': 'dose_unit',
+        'ATC-kode': 'code',
+        'Virkestoff': 'active_substance',
+        'MT-innehaver': 'manufacturer'
+    }
diff --git a/unrefactored/templates/template.scs b/unrefactored/templates/template.scs
new file mode 100644
index 0000000..4386d1b
--- /dev/null
+++ b/unrefactored/templates/template.scs
@@ -0,0 +1,19 @@
+{{identifier}}
+=> nrel_main_idtf:{% for lang, label in labels %}
+    [{{label}}] (* <-lang_{{lang}};; *);{% endfor %}
+
+<- rrel_key_sc_element: ...
+    (*
+        <- sc_definition;;
+        => nrel_main_idtf:{% for lang, label in labels %}
+            [Def.({{label}})] (* <-lang_{{lang}};; *);{% endfor %};
+    *);
+
+=> nrel_atc_code: {{code}};
+{% if manufacturer %}=> nrel_company: {{manufacturer}};{% endif %}
+{% if active_substance %}{% for substance in active_substance %}=> nrel_active_substances: {{substance}};
+{% endfor %}{% endif %}
+{% if form %}{% for f in form %}=> nrel_dosage_form: {{f}};
+{% endfor %}{% endif %}
+
+<-sc_node_not_relation;;
diff --git a/unrefactored/utils/repair_xlsx.py b/unrefactored/utils/repair_xlsx.py
new file mode 100644
index 0000000..adba589
--- /dev/null
+++ b/unrefactored/utils/repair_xlsx.py
@@ -0,0 +1,9 @@
+from aspose.cells import Workbook
+
+
+def repair_xlsx(source: str, destination: str) -> None:
+    """
+    Repairs a broken xlsx file by opening the workbook at `source` and saving it to `destination` (may be the same path).
+    """
+    workbook = Workbook(source)
+    workbook.save(destination)
diff --git a/unrefactored/utils/translate_text.py b/unrefactored/utils/translate_text.py
new file mode 100644
index 0000000..5a5cd53
--- /dev/null
+++ b/unrefactored/utils/translate_text.py
@@ -0,0 +1,7 @@
+
+def translate(text: str, dest_lang='no', targ_lang='en') -> str:
+    """
+    Placeholder translator: currently returns `text` unchanged and ignores both language arguments.
+    """
+    return text
+