From 1d73b0c189345b6e3e7b18c7c2e5ef128dd0266a Mon Sep 17 00:00:00 2001 From: YOTTSO Date: Thu, 14 Dec 2023 16:54:24 +0300 Subject: [PATCH] added netherland website parser --- src/netherland-parse/LICENSE | 21 +++++ src/netherland-parse/README.md | 71 ++++++++++++++++ src/netherland-parse/example_of_parsing.json | 57 +++++++++++++ src/netherland-parse/parser.py | 85 ++++++++++++++++++++ src/netherland-parse/translator.py | 61 ++++++++++++++ 5 files changed, 295 insertions(+) create mode 100644 src/netherland-parse/LICENSE create mode 100644 src/netherland-parse/README.md create mode 100644 src/netherland-parse/example_of_parsing.json create mode 100644 src/netherland-parse/parser.py create mode 100644 src/netherland-parse/translator.py diff --git a/src/netherland-parse/LICENSE b/src/netherland-parse/LICENSE new file mode 100644 index 0000000..ab641cf --- /dev/null +++ b/src/netherland-parse/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 YOTTSO + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/netherland-parse/README.md b/src/netherland-parse/README.md new file mode 100644 index 0000000..5ab44ad --- /dev/null +++ b/src/netherland-parse/README.md @@ -0,0 +1,71 @@ +# README + +This repository contains two Python scripts: `translator.py` and `parser.py`. These scripts are designed to assist in translating and parsing medicine data from a CSV file and a website. + +## Translator + +The `translator.py` script provides functions for translating the JSON database into SCS files using a template. It includes the following functions: + +- `translate_to_russian(text)`: Translates the given text to Russian using the Google Translate API. + +- `substances_check(substances)`: Checks whether each substance is present in `active.scs` (the database of all substances) and appends an entry for any that is missing. + +- `main()`: Builds an SCS file for each medicine from the JSON data. + +These functions convert the parsed medicines into SCS files, which are written to the `scs_out` folder by default. + +## Parser + +The `parser.py` script is used to parse medicine data from a CSV file and a website. It utilizes the Selenium library to scrape data from the website and the pandas library to read data from the CSV file. The script performs the following tasks: + +- Function `fix_name(data)`: This function takes a string or a list of strings as input and applies several transformations to ensure consistent naming conventions. It replaces whitespace, special characters, and converts the text to lowercase. + +- Function `get_composition_for_medicine(url)`: This function takes a URL of a medicine information page as input and scrapes the composition, active substances, and marketing authorization holder from the page using web scraping techniques. 
+ +- Function `parse_medicines_from_csv(csv_url)`: This function reads medicine data from a CSV file located at the given URL. It iterates over the rows of the CSV file, extracts relevant information such as product names, potency, ATC codes, alternative names, pharmaceutical forms, and usage methods. It then calls the `get_composition_for_medicine` function to retrieve the composition, active substances, and marketing authorization holder for each medicine. + +The parsed data is stored in a JSON file named `result.json`. + +## Dependencies + +The following dependencies are required to run the scripts: + +- `googletrans`: A Python library for Google Translate API. + +- `selenium`: A Python library for web scraping and automation. + +- `pandas`: A powerful data manipulation library. + +Make sure to install these dependencies before running the scripts using the following command: + +``` +pip install googletrans selenium pandas +``` + +## Usage + +To use the scripts, follow these steps: + +1. Clone the repository to your local machine. + +2. Install the required dependencies using the command mentioned above. + +3. Open a terminal or command prompt and navigate to the directory containing the scripts. + +4. Run the `parser.py` script using the following command: + +``` +python parser.py +``` + +This will parse the medicine data from the CSV file and generate the `result.json` file containing the parsed data. + +5. Optionally, you can modify and use the `translator.py` script to perform translations or use the provided translation functions in your own code. + +## Note + +Please note that the `parser.py` script relies on web scraping techniques to extract data from a specific website. Make sure to comply with the website's terms of service and do not abuse or overload the website with excessive requests. + +## License + +This project is licensed under the [MIT License](LICENSE). Feel free to modify and distribute the code as per the terms of the license. 
diff --git a/src/netherland-parse/example_of_parsing.json b/src/netherland-parse/example_of_parsing.json new file mode 100644 index 0000000..306812c --- /dev/null +++ b/src/netherland-parse/example_of_parsing.json @@ -0,0 +1,57 @@ +[ + { + "Название препарата (национальный язык)": "desferal", + "Название препарата (английский)": "Lateral", + "ATC-код": "V03AC01", + "Дозировка": "nan", + "Действующие вещества": [ + "deferoxaminemesilaat_500_mg_flacon_samenstelling_overeenkomend_met", + "deferoxamine_426_8_mg_flacon" + ], + "Варианты альтернативного названия препарата": "deferoxamine", + "Производитель/держатель лицензии": "Medcor Pharmaceuticals B.V.Artemisweg 2328239 DE LELYSTAD", + "Состав": [ + "geen_hulpstoffen" + ], + "Фармацевтическая форма": "powder_for_solution_for_injection_or_infusion", + "Способы применения": "parenteral" + }, + { + "Название препарата (национальный язык)": "zoladex_10", + "Название препарата (английский)": "zoladex_10", + "ATC-код": "L02AE03", + "Дозировка": "nan", + "Действующие вещества": [ + "goserelineacetaat_samenstelling_overeenkomend_met", + "", + "gosereline_10_8_mg_stuk" + ], + "Варианты альтернативного названия препарата": "goserelin", + "Производитель/держатель лицензии": "Medcor Pharmaceuticals B.V.Artemisweg 2328239 DE LELYSTAD", + "Состав": [ + "melkzuur(d_l)_glycolzuur_copolymeer" + ], + "Фармацевтическая форма": "implantatietablet", + "Способы применения": "parenteral" + }, + { + "Название препарата (национальный язык)": "alutard_sq_2_huisstofmijten_100", + "Название препарата (английский)": "Alutard_sq_2_huisstofmites_100", + "ATC-код": "V01AA03", + "Дозировка": "nan", + "Действующие вещества": [ + "waterig_extract_van_een_mengsel_van_dermatophagoides_pteronyssinus_en_dermatophagoides_farinae_100000_sq_e_ml" + ], + "Варианты альтернативного названия препарата": "house_dust_mites", + "Производитель/держатель лицензии": "Medcor Pharmaceuticals B.V.Artemisweg 2328239 DE LELYSTAD", + "Состав": [ + 
# ---------------------------------------------------------------------------
# src/netherland-parse/parser.py
# ---------------------------------------------------------------------------
import json
import logging
import re
from functools import lru_cache

# NOTE: googletrans, selenium and pandas are imported inside the functions
# that need them, so the pure helpers (e.g. ``fix_name``) stay importable on
# machines that do not have the scraping stack installed.

logger = logging.getLogger(__name__)

# Runs of whitespace, slashes, dots, commas and dashes collapse to one "_".
_SEPARATORS_RE = re.compile(r"[\s/.,-]+")
# Leading digits of a registration number, i.e. the digits before "/" or "=".
_REGISTRATION_RE = re.compile(r"(\d+)[/=]")

MEDICINE_URL_TEMPLATE = (
    "https://www.geneesmiddeleninformatiebank.nl/ords/f?p=111:3::SEARCH:::"
    "P0_DOMAIN,P0_LANG,P3_RVG1:H,EN,{registration_number}"
)


@lru_cache(maxsize=4096)
def translate_to_english(text):
    """Return *text* translated to English via googletrans.

    Results are memoised because the CSV repeats the same pharmaceutical
    forms and administration routes on many rows; without the cache every
    repetition costs one network round trip.
    """
    from googletrans import Translator

    return Translator().translate(text, dest='en').text


def fix_name(data):
    """Normalise a name, or a list of names, into identifier form.

    * ``str``: lower-cased, backslashes removed, and every run of
      whitespace, ``/``, ``.``, ``,`` or ``-`` replaced by a single ``_``.
    * ``list``: each element is lower-cased and has its spaces replaced by
      ``_``. The list is updated in place and the same object is returned.
    * anything else (e.g. ``None``): ``None`` is returned.
    """
    if isinstance(data, str):
        return _SEPARATORS_RE.sub("_", data.lower().replace('\\', ''))
    if isinstance(data, list):
        data[:] = [item.replace(' ', '_').lower() for item in data]
        return data
    return None


def _is_missing(value):
    """Return True for ``None`` and for float NaN (how pandas marks empty cells)."""
    return value is None or (isinstance(value, float) and value != value)


def _split_atc(value):
    """Split an ``"<code> - <name>"`` ATC cell into ``(code, name)``.

    A missing cell yields ``(None, None)``; a cell without the ``" - "``
    separator yields ``(cell, None)`` instead of raising ``IndexError``.
    """
    if _is_missing(value):
        return None, None
    parts = str(value).split(' - ')
    return parts[0], (parts[1] if len(parts) > 1 else None)


def _registration_number(value):
    """Return the first run of digits followed by ``/`` or ``=``, or ``None``."""
    if _is_missing(value):
        return None
    match = _REGISTRATION_RE.search(str(value))
    return int(match.group(1)) if match else None


def _translate_label(value):
    """Translate a CSV cell to English and normalise it; ``None`` if the cell is empty."""
    if _is_missing(value):
        return None
    return fix_name(translate_to_english(value))


def _read_cell_lines(driver, xpath):
    """Return the normalised, non-blank text lines of the element at *xpath*.

    Returns ``None`` when the element cannot be read, so a page without the
    section does not abort the whole scrape.
    """
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    try:
        text = driver.find_element(By.XPATH, xpath).text
    except WebDriverException:
        return None
    stripped_lines = (line.strip() for line in text.split('\n'))
    # Blank lines in the cell used to leak into the result as "" entries.
    return [fix_name(line) for line in stripped_lines if line]


def get_composition_for_medicine(url, driver=None):
    """Scrape excipients, active substances and licence holder from *url*.

    Args:
        url: Address of the medicine information page.
        driver: Optional Selenium WebDriver to reuse. When omitted, a Chrome
            instance is started for this call and closed before returning.

    Returns:
        ``(components, active, holder_text)``. The first two are lists of
        normalised names and the last is the holder text; each is ``None``
        when the matching element could not be read.
    """
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    owns_driver = driver is None
    if owns_driver:
        driver = webdriver.Chrome()
    try:
        driver.get(url)
        components = _read_cell_lines(
            driver, "//td[text()='Excipients:']/following-sibling::td")
        active = _read_cell_lines(
            driver, "//td[text()='Active substance:']/following-sibling::td")
        try:
            holder_element = driver.find_element(
                By.XPATH,
                '//li[contains(text(), "Marketing authorisation holder:")]')
            holder_text = holder_element.find_element(
                By.CLASS_NAME, 'pull-right').get_attribute("textContent")
        except WebDriverException:
            holder_text = None
        return components, active, holder_text
    finally:
        # Only close browsers started here; a caller-supplied one is reused.
        if owns_driver:
            driver.quit()


def _build_record(row, driver):
    """Turn one CSV row into an output record, or ``None`` if it is unusable."""
    registration_number = _registration_number(row['REGISTRATIENUMMER'])
    raw_name = row['PRODUCTNAAM']
    if registration_number is None or _is_missing(raw_name):
        return None

    # Keep only the part of the product name before the first ',', '.' or '/'.
    product_name = fix_name(re.split(r'[,./]', str(raw_name), maxsplit=1)[0])
    atc_code, alt_name = _split_atc(row['ATC'])
    potency = row['POTENTIE']

    medicine_url = MEDICINE_URL_TEMPLATE.format(
        registration_number=registration_number)
    composition, active, license_holder = get_composition_for_medicine(
        medicine_url, driver=driver)

    return {
        'Название препарата (национальный язык)': product_name,
        'Название препарата (английский)': translate_to_english(product_name),
        'ATC-код': atc_code,
        'Дозировка': f"{potency}",
        'Действующие вещества': active,
        'Варианты альтернативного названия препарата': fix_name(alt_name),
        'Производитель/держатель лицензии': license_holder,
        'Состав': composition,
        'Фармацевтическая форма': _translate_label(row['FARMACEUTISCHEVORM']),
        'Способы применения': _translate_label(row['TOEDIENINGSWEG']),
    }


def parse_medicines_from_csv(csv_url):
    """Read the ``|``-separated CSV at *csv_url* and write ``result.json``.

    Each row is enriched with data scraped from the medicine's information
    page. Rows without a product name or a parsable registration number are
    skipped with a warning. One browser instance is shared by all rows
    instead of starting Chrome once per medicine.
    """
    import pandas as pd
    from selenium import webdriver

    df = pd.read_csv(csv_url, sep='|')
    result_data = []

    driver = webdriver.Chrome()
    try:
        for index, row in df.iterrows():
            record = _build_record(row, driver)
            if record is None:
                logger.warning(
                    "Skipping row %s: missing product name or registration number",
                    index)
                continue
            result_data.append(record)
    finally:
        driver.quit()

    with open('result.json', 'w', encoding='utf-8') as json_file:
        json.dump(result_data, json_file, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    csv_url = "https://www.geneesmiddeleninformatiebank.nl/metadata.csv"
    parse_medicines_from_csv(csv_url)


# ---------------------------------------------------------------------------
# src/netherland-parse/translator.py
# ---------------------------------------------------------------------------
import json
import os
import re
from functools import lru_cache

ACTIVE_SUBSTANCES_FILE = 'extra/active.scs'
RESULT_FILE = 'result.json'
SCS_OUTPUT_DIR = 'scs_out'


@lru_cache(maxsize=4096)
def translate_to_russian(text):
    """Return *text* translated to Russian via googletrans (memoised)."""
    from googletrans import Translator

    return Translator().translate(text, dest='ru').text


def substances_check(substances):
    """Append an entry to ``extra/active.scs`` for every unknown substance.

    A substance counts as known when its identifier occurs anywhere in the
    file's text. Empty identifiers, and a ``None`` or empty *substances*
    argument, are ignored. Each unknown substance is appended once, even if
    it is listed several times.
    """
    if not substances:
        return

    # UTF-8 is explicit because the entries contain Russian text, which the
    # platform default encoding cannot always represent.
    with open(ACTIVE_SUBSTANCES_FILE, 'r', encoding='utf-8') as actives_file:
        known = actives_file.read()

    # dict.fromkeys removes duplicates while keeping first-seen order.
    missing = [name for name in dict.fromkeys(substances)
               if name and name not in known]
    if not missing:
        return

    with open(ACTIVE_SUBSTANCES_FILE, 'a', encoding='utf-8') as actives_file:
        for substance in missing:
            identifier = substance.replace('_', ' ')
            identifier_ru = translate_to_russian(identifier)
            actives_file.write(
                f"{substance}\n"
                f"    =>nrel_main_idtf:[{identifier_ru}] (* <-lang_ru;;*);\n"
                f"    =>nrel_main_idtf:[{identifier}] (* <-lang_en;;*);;\n"
                "\n"
            )


def main():
    """Write one ``scs_out/<name>.scs`` file per medicine in ``result.json``."""
    with open(RESULT_FILE, 'r', encoding='utf-8') as file:
        data = json.load(file)

    os.makedirs(SCS_OUTPUT_DIR, exist_ok=True)

    for item in data:
        name = item["Название препарата (английский)"]
        identifier = name.replace('_', ' ')
        dose = item["Дозировка"]
        atc_code = item["ATC-код"]
        alt_name = item["Варианты альтернативного названия препарата"]
        license_holder = item["Производитель/держатель лицензии"]
        dosage_form = item["Фармацевтическая форма"]
        usage_method = item["Способы применения"]
        active = item["Действующие вещества"]
        substances_check(active)

        # NOTE(review): the lang_nl identifier reuses the English name,
        # ``active`` is interpolated as a Python list repr, and the route of
        # administration is hard-coded. Confirm all three against the
        # knowledge-base conventions.
        drug_template = (
            f"medication_{name}_{dose}\n"
            f"    <-sc_node_not_relation;\n"
            f"    => nrel_main_idtf:\n"
            f"        [{identifier} ({dose})] (* <- lang_en;;*);\n"
            f"        [{identifier} ({dose})] (* <- lang_nl;;*);\n"
            f"    => nrel_atc_code: {atc_code};\n"
            f"    => nrel_international_non_proprietary_name: [{alt_name}];\n"
            f"    => nrel_company: [{license_holder}];\n"
            f"    => nrel_countries_of_sale: Netherlands;\n"
            f"    => nrel_active_substances: {active};\n"
            f"    => nrel_dosage_form: {dosage_form};\n"
            f"    => nrel_usage_method: {usage_method};\n"
            f"    => nrel_route_of_administration:concept_parenteral_route;;"
        )

        file_name = f"{SCS_OUTPUT_DIR}/{name}.scs"
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(drug_template)


# The original guard compared the string literal '__name__' with '__main__',
# which is always False, so the script never ran.
if __name__ == '__main__':
    main()