Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions unrefactored/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import os
from src.handlers.AustrianHandler import AustrianHandler
from src.handlers.NorwegianHandler import NorwegianHandler

if __name__ == '__main__':
    # Entry point: render the .scs files for the Austrian source and write them to disk.
    handler = AustrianHandler()

    # handler.run() would additionally drive the browser download before rendering.
    # Only the render-and-save half is executed here, so the raw workbook must
    # already be present in the handler's raw data directory.
    os.makedirs(handler.OUTPUT_DIR, exist_ok=True)
    for name, payload in handler.render_scs():
        # Join instead of concatenating so the result does not depend on
        # OUTPUT_DIR ending with a path separator.
        target = os.path.join(handler.OUTPUT_DIR, f'{name}.scs')
        with open(target, 'w', encoding="utf-8") as file:
            file.write(payload)
26 changes: 26 additions & 0 deletions unrefactored/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
aspose-cells-python==23.11.0
attrs==23.1.0
certifi==2023.11.17
cffi==1.16.0
et-xmlfile==1.1.0
h11==0.14.0
idna==3.4
Jinja2==3.1.2
MarkupSafe==2.1.3
numpy==1.26.2
openpyxl==3.1.2
outcome==1.3.0.post0
pandas==2.1.3
pycparser==2.21
PySocks==1.7.1
python-dateutil==2.8.2
pytz==2023.3.post1
selenium==4.15.2
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.23.1
trio-websocket==0.11.1
tzdata==2023.3
urllib3==2.1.0
wsproto==1.2.0
212 changes: 212 additions & 0 deletions unrefactored/src/generics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import os
import time
from string import punctuation
from abc import ABCMeta, abstractmethod

import pandas as pd
from selenium import webdriver
from jinja2 import Environment, FileSystemLoader
from utils.repair_xlsx import repair_xlsx
from utils.translate_text import translate


class AbstractHandler(metaclass=ABCMeta):
    """
    Interface shared by all handlers.

    A concrete handler has to implement ``generate``, ``render_scs`` and ``run``.
    """

    # Configuration slots; both default to None on this base class.
    URL = None
    OUTPUT_DIR = None

    @abstractmethod
    def generate(self) -> dict:
        """
        Generator hook: yield processed .json objects that are ready to be inserted into the template.
        """

    @abstractmethod
    def render_scs(self) -> (str, str):
        """
        Rendering hook: obtains its data through the 'generate' method.
        """

    @abstractmethod
    def run(self):
        """
        Pipeline hook: execute the complete download-parse-render-save sequence.
        """


class AbstractStreamHandler(AbstractHandler):
    """
    Handler interface for sources whose data is parsed and streamed from the target site.
    """

    @abstractmethod
    def generate_raw(self) -> dict:
        """
        Generator hook: parse the target site and stream its data.
        """


class AbstractXLSXHandler(AbstractHandler):
    """
    Base class for handlers that obtain their data from a downloaded xlsx file.

    Subclasses implement ``download_xlsx`` (plus the hooks inherited from
    ``AbstractHandler``); ``run`` then ties the steps together.
    """

    # Directory the raw xlsx file is stored in; None on this base class.
    RAW_DATA_DIR = None

    @abstractmethod
    def download_xlsx(self) -> None:
        """
        Downloads xlsx file from target site and saves it in filesystem.
        """

    def run(self):
        """
        Download the xlsx file, render every entry and write each one to ``OUTPUT_DIR``.

        Each ``(name, payload)`` pair produced by ``render_scs`` is written to
        ``<OUTPUT_DIR>/<name>.scs`` as UTF-8. Pairs whose name is None are skipped.
        """
        self.download_xlsx()
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        for name, payload in self.render_scs():
            if name is None:
                continue
            # Join instead of concatenating so the result does not depend on
            # OUTPUT_DIR ending with a path separator.
            target = os.path.join(self.OUTPUT_DIR, f'{name}.scs')
            with open(target, 'w', encoding="utf-8") as file:
                file.write(payload)


class StandartXLSXHandler(AbstractXLSXHandler):
    """
    Generic download-parse-render handler for sites that export an xlsx file.

    Concrete handlers configure it through class attributes:

    * ``URL``, ``OUTPUT_DIR``, ``RAW_DATA_DIR`` - page to open, folder for rendered files,
      folder for the downloaded file.
    * ``TEXT_LINK_SEQUENCE`` - iterable of ``(locator strategy, locator value, seconds)``
      tuples; each element is located, clicked, and followed by a sleep of that many seconds.
    * ``COUNTRY_CODE`` - language tag attached to the original label.
    * ``FILENAME_RAW`` - file name the downloaded workbook is stored under.
    * ``XLSX_MAPPING`` - maps workbook column names to template field names.
    * ``INITIAL_WAIT_SECONDS`` - pause after the page has been requested.
    """

    INITIAL_WAIT_SECONDS = 0
    TEXT_LINK_SEQUENCE = None
    COUNTRY_CODE = None
    FILENAME_RAW = None
    XLSX_MAPPING = None

    # Attributes that must be overridden with a non-None value before instantiation.
    _REQUIRED_FIELDS = ('TEXT_LINK_SEQUENCE', 'URL', 'COUNTRY_CODE', 'FILENAME_RAW', 'OUTPUT_DIR', 'RAW_DATA_DIR')

    # Deletes every punctuation character except '_' and '%', plus a few typographic marks.
    _NAME_STRIP_TABLE = str.maketrans('', '', punctuation.replace('_', '').replace('%', '') + '„“–')

    def __init__(self):
        """
        Validate the class-level configuration.

        Raises:
            Exception: if any required attribute is still None; the message lists them.
        """
        super().__init__()

        null_fields = [field for field in self._REQUIRED_FIELDS if getattr(self, field) is None]
        if null_fields:
            raise Exception(f"Following fields can't be None: {', '.join(null_fields)}")

    def download_xlsx(self) -> None:
        """
        Downloads xlsx file from a target site and saves it in filesystem.

        Failures during navigation, renaming or repairing are printed rather than
        raised (best effort). The browser is always closed afterwards.
        """
        os.makedirs(self.RAW_DATA_DIR, exist_ok=True)
        options = webdriver.ChromeOptions()
        options.add_experimental_option("prefs", {"download.default_directory": self.RAW_DATA_DIR})

        driver = webdriver.Chrome(options=options)
        try:
            driver.maximize_window()
            try:
                self._navigate_and_store(driver)
            except Exception as err:
                # Deliberately non-fatal: report the problem and let the caller carry on.
                print(err)
        finally:
            # Without this the browser process outlives the download step.
            driver.quit()

    def _navigate_and_store(self, driver) -> None:
        """Click through to the download, then rename and repair the newest file in RAW_DATA_DIR."""
        # 1. Load page.
        driver.get(self.URL)
        time.sleep(self.INITIAL_WAIT_SECONDS)

        # 2. Navigate to the download button using the configured click sequence.
        for by, text, sleep in self.TEXT_LINK_SEQUENCE:
            driver.find_element(by, text).click()
            time.sleep(sleep)

        # 3. The most recently created file in the folder is taken to be the download.
        candidates = [os.path.join(self.RAW_DATA_DIR, entry) for entry in os.listdir(self.RAW_DATA_DIR)]
        newest = max(candidates, key=os.path.getctime)

        # 4. Rename file. os.replace overwrites a file left over from an earlier run,
        #    where os.rename would fail on Windows.
        target = os.path.join(self.RAW_DATA_DIR, self.FILENAME_RAW)
        os.replace(newest, target)

        # 5. Repair xlsx file in place.
        repair_xlsx(target, target)

    @staticmethod
    def _split_stripped(text, separator):
        """Split ``text`` on ``separator`` and strip surrounding whitespace from every part."""
        return [part.strip() for part in text.split(separator)]

    def generate(self) -> dict:
        """
        A simple generator that yields .json entries for .scs templates.

        Reads ``FILENAME_RAW`` from ``RAW_DATA_DIR``, keeps only the columns named in
        ``XLSX_MAPPING``, drops rows with missing values, renames the columns and
        converts everything to strings. Rows whose ``dose_unit`` contains '%' are dropped.

        Yields:
            dict: one entry per remaining row. ``name`` has punctuation removed and
            spaces replaced by underscores; ``form``, ``dose_value``, ``dose_unit`` and
            ``active_substance`` (when present) are lists of stripped strings.
        """
        raw_df = pd.read_excel(os.path.join(self.RAW_DATA_DIR, self.FILENAME_RAW))
        raw_df = raw_df[list(self.XLSX_MAPPING)]
        raw_df = raw_df.dropna().rename(columns=self.XLSX_MAPPING).astype('str')

        columns = set(raw_df.columns)
        if 'dose_unit' in columns:
            raw_df = raw_df[~raw_df['dose_unit'].str.contains('%')]

        # # Run script with "export PYTHONIOENCODING=UTF-8" if the console cannot print the data.

        for _, row in raw_df.iterrows():
            out = row.to_dict()
            out['name'] = "_".join(out['name'].translate(self._NAME_STRIP_TABLE).split(' '))

            if 'form' in columns:
                out['form'] = self._split_stripped(out['form'], ',')
            if 'dose_value' in columns:
                # ';' and '/' both separate several doses; decimal marks become '_'.
                value = out['dose_value'].replace(';', '/').replace(',', '_').replace('.', '_')
                out['dose_value'] = self._split_stripped(value, '/')
            if 'dose_unit' in columns:
                out['dose_unit'] = self._split_stripped(out['dose_unit'].replace(';', '/'), '/')
            if 'active_substance' in columns:
                out['active_substance'] = self._split_stripped(out['active_substance'], ';')

            if {'dose_value', 'dose_unit', 'form'} <= columns:
                difference = len(out['dose_value']) - len(out['dose_unit'])

                # When one list is exactly one element longer, the shorter one is doubled.
                if difference == 1:
                    out['dose_unit'] = out['dose_unit'] + out['dose_unit']
                elif difference == -1:
                    out['dose_value'] = out['dose_value'] + out['dose_value']

                if len(out['dose_value']) > 2 or len(out['dose_unit']) > 2 or abs(difference) > 1:
                    continue

                if not len(out['dose_value']) == len(out['dose_unit']) == len(out['form']):
                    continue

            yield out

    def render_scs(self) -> (str, str):
        """
        Generator responsible for yielding tuples consisting of filenames and rendered jinja2 templates.

        The template ``template.scs`` is loaded from ``./templates`` (relative to the
        working directory). For entries carrying ``dose_value`` and ``dose_unit`` one
        tuple is yielded per value/unit pair, otherwise a single tuple.

        Yields:
            tuple: ``(identifier, rendered_text)``.
        """
        jinja_env = Environment(loader=FileSystemLoader('./templates'))
        template = jinja_env.get_template('template.scs')

        for entry in self.generate():
            # isinstance is required here: comparing type(...) with the string 'str'
            # is never true, which left every value unstripped.
            data = {key: value.strip() if isinstance(value, str) else value for key, value in entry.items()}

            data['labels'] = [
                (self.COUNTRY_CODE, data['name']),
                ('en', translate(data['name'])),
            ]

            # Labels keep the original casing; identifiers use the lower-cased name.
            data['name'] = data['name'].lower()

            if 'dose_value' in data and 'dose_unit' in data:
                for dvalue, dunit in zip(data['dose_value'], data['dose_unit']):
                    identifier = f'medication_{data["name"]}_{dvalue}{dunit}'
                    data['identifier'] = identifier
                    yield identifier, template.render(**data)
            else:
                identifier = f'medication_{data["name"]}'
                data['identifier'] = identifier
                yield identifier, template.render(**data)
59 changes: 59 additions & 0 deletions unrefactored/src/handlers/AustrianHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sys

from selenium.webdriver.common.by import By
import pandas as pd
from string import punctuation

from src.generics import StandartXLSXHandler


class AustrianHandler(StandartXLSXHandler):
    """
    Handler for the export offered at aspregister.basg.gv.at.

    The workbook has no separate dose columns, so ``generate`` is overridden to
    split the dose value and unit out of the product name.
    """

    INITIAL_WAIT_SECONDS = 8
    # (locator strategy, locator value, seconds to sleep after the click)
    TEXT_LINK_SEQUENCE = [
        (By.XPATH, "//button[contains(text(),'Suchen')]", 1),
        (By.XPATH, "//img[@title='Trefferliste als .xls herunterladen']", 45)
    ]
    URL = 'https://aspregister.basg.gv.at/aspregister/faces/aspregister.jspx'
    # NOTE(review): 'au' is the ISO 3166-1 code of Australia (Austria is 'at'). It feeds
    # FILENAME_RAW and OUTPUT_DIR below, so confirm before changing it.
    COUNTRY_CODE = 'au'
    FILENAME_RAW = f'data_{COUNTRY_CODE}.xlsx'
    OUTPUT_DIR = f'output/{COUNTRY_CODE}/'
    RAW_DATA_DIR = f"{sys.path[0]}\\data\\"
    XLSX_MAPPING = {
        'Name': 'name',
        'ATC Code': 'code',
        'Wirkstoff': 'active_substance',
        'Inhaber/-in ': 'manufacturer'
    }

    # Substrings that mark the dose unit inside a cleaned product name.
    # The comma between '_mg/' and '_g_' used to be missing, which silently merged
    # the two into the single marker '_mg/_g_' and made '_g_' unrecognisable.
    # NOTE(review): '/' is stripped from the name before matching, so the markers
    # ending in '/' presumably never match - confirm whether they are still wanted.
    _DOSE_UNIT_MARKERS = (
        '_mg_', '_mg/', '_g_', '_g/', '_E_', '_E/', '_I_', '_I/', '_mmol_', '_mmol/', '_%',
    )

    # Deletes every punctuation character except '_' and '%', plus a few typographic marks.
    _NAME_STRIP_TABLE = str.maketrans('', '', punctuation.replace('_', '').replace('%', '') + '„“–')

    def generate(self) -> dict:
        """
        A simple generator that yields .json entries for .scs templates.

        Reads the raw workbook, keeps and renames the columns listed in
        ``XLSX_MAPPING``, drops rows with missing values and converts all values to
        strings. The product name is then split at the earliest dose-unit marker:
        the token right before the marker becomes the dose value, everything before
        that token the name. Rows without any marker are skipped.

        Yields:
            dict: the row with ``name`` shortened, ``dose_value`` and ``dose_unit`` as
            one-element lists and ``active_substance`` split on whitespace.
        """
        raw_df = pd.read_excel(f"{self.RAW_DATA_DIR}\\{self.FILENAME_RAW}")
        raw_df = raw_df[list(self.XLSX_MAPPING)]
        raw_df = raw_df.dropna().rename(columns=self.XLSX_MAPPING).astype('str')

        for _, row in raw_df.iterrows():
            out = row.to_dict()
            # NOTE(review): '.' and ',' are removed here as well, so a dose written as
            # "0,5" ends up as "05" - confirm whether that is acceptable.
            full_name = "_".join(out['name'].translate(self._NAME_STRIP_TABLE).split(' '))

            # partition() returns the whole string as head when the marker is absent,
            # so the shortest head belongs to the marker that occurs first.
            splits = [full_name.partition(marker) for marker in self._DOSE_UNIT_MARKERS]
            head, dose_unit, _tail = min(splits, key=lambda parts: len(parts[0]))
            head = head.strip()
            if not dose_unit:
                continue

            # The last '_'-separated token of the head is the numeric dose.
            name, _sep, dose_value = head.rpartition('_')

            out['name'] = name.strip('_')
            out['dose_value'] = [dose_value.strip('_')]
            out['dose_unit'] = [dose_unit.strip('_')]
            out['active_substance'] = out['active_substance'].split()

            yield out
27 changes: 27 additions & 0 deletions unrefactored/src/handlers/NorwegianHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import sys

from selenium.webdriver.common.by import By
from src.generics import StandartXLSXHandler


class NorwegianHandler(StandartXLSXHandler):
    """
    Configuration-only handler for the export offered at www.legemiddelsok.no.

    All behaviour comes from ``StandartXLSXHandler``; this class only supplies the settings.
    """

    # Source identity. COUNTRY_CODE is interpolated into the file and folder names below.
    COUNTRY_CODE = 'no'
    URL = 'https://www.legemiddelsok.no/'

    # Where the raw download and the rendered files are stored.
    RAW_DATA_DIR = f"{sys.path[0]}\\data\\"
    FILENAME_RAW = f'data_{COUNTRY_CODE}.xlsx'
    OUTPUT_DIR = f'output/{COUNTRY_CODE}/'

    # (locator strategy, link text, seconds to sleep after the click)
    TEXT_LINK_SEQUENCE = [
        (By.PARTIAL_LINK_TEXT, 'Pakninger', 6),
        (By.PARTIAL_LINK_TEXT, 'Eksporter resultater', 6)
    ]

    # Workbook column name -> template field name.
    XLSX_MAPPING = {
        'Handelsnavn': 'name',
        'Form': 'form',
        'Styrke tallverdi': 'dose_value',
        'Styrke enhet': 'dose_unit',
        'ATC-kode': 'code',
        'Virkestoff': 'active_substance',
        'MT-innehaver': 'manufacturer'
    }

19 changes: 19 additions & 0 deletions unrefactored/templates/template.scs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{#
  Template for one rendered .scs entry.
  Context variables read below:
    identifier        - text placed on the first line
    labels            - iterable of (lang, label) pairs
    code              - value written after nrel_atc_code
    manufacturer      - optional; emitted as nrel_company when truthy
    active_substance  - optional iterable; one nrel_active_substances line per item
    form              - optional iterable; one nrel_dosage_form line per item
  The closing tag of this comment strips the following newline, so the rendered
  output is unchanged by its presence.
-#}
{{identifier}}
=> nrel_main_idtf:{% for lang, label in labels %}
[{{label}}] (* <-lang_{{lang}};; *);{% endfor %}

<- rrel_key_sc_element: ...
(*
<- sc_definition;;
=> nrel_main_idtf:{% for lang, label in labels %}
[Def.({{label}})] (* <-lang_{{lang}};; *);{% endfor %};
*);

=> nrel_atc_code: {{code}};
{% if manufacturer %}=> nrel_company: {{manufacturer}};{% endif %}
{% if active_substance %}{% for substance in active_substance %}=> nrel_active_substances: {{substance}};
{% endfor %}{% endif %}
{% if form %}{% for f in form %}=> nrel_dosage_form: {{f}};
{% endfor %}{% endif %}

<-sc_node_not_relation;;
9 changes: 9 additions & 0 deletions unrefactored/utils/repair_xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from aspose.cells import Workbook


def repair_xlsx(source: str, destination: str) -> None:
    """
    Load the workbook at ``source`` and save it again at ``destination``.

    The load/save round trip is what is relied upon to repair a broken xlsx file;
    passing the same path twice overwrites the file in place.
    """
    Workbook(source).save(destination)
7 changes: 7 additions & 0 deletions unrefactored/utils/translate_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

def translate(text: str, dest_lang='no', targ_lang='en') -> str:
    """
    Placeholder translator.

    No translation is performed: ``text`` is handed back unchanged and both
    language arguments are ignored.
    """
    translated = text
    return translated