Skip to content

Commit c0cee37

Browse files
committed
chore(sinan DAGS): create DAG to fetch dengue data from SINAN
1 parent 722af34 commit c0cee37

File tree

5 files changed

+140
-1
lines changed

5 files changed

+140
-1
lines changed

containers/airflow/Dockerfile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,13 @@ RUN /usr/local/bin/python -m virtualenv /opt/envs/py311 --python="/opt/py311/bin
9494
&& source /opt/envs/py311/bin/activate \
9595
&& pip install "cython<3.0.0" \
9696
&& pip install --no-build-isolation "pyyaml<6.0" \
97-
&& pip install -r /opt/envs/pysus.txt
97+
&& pip install \
98+
psycopg2-binary \
99+
"apache-airflow>=2.7.1" \
100+
apache-airflow-providers-celery \
101+
redis \
102+
"dill>=0.3.7" \
103+
-r /opt/envs/pysus.txt
98104

99105
WORKDIR ${AIRFLOW_HOME}
100106

containers/airflow/dags/brasil/sinan/chikungunya.py

Whitespace-only changes.

containers/airflow/dags/brasil/sinan/dengue.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import pendulum

from datetime import timedelta
from airflow import DAG
from airflow.decorators import task


# Default task-level arguments shared by the DAG(s) defined in this file.
default_args = {
    "owner": "epigraphhub",
    # A run does not wait for the previous scheduled run to succeed.
    "depends_on_past": False,
    "start_date": pendulum.datetime(2023, 1, 1),
    # Failure notifications go to the EpiGraphHub team inbox.
    "email": ["epigraphhub@thegraphnetwork.org"],
    "email_on_failure": True,
    "email_on_retry": False,
    # Retry failed tasks twice, one minute apart.
    "retries": 2,
    "retry_delay": timedelta(minutes=1),
}
18+
19+
with DAG(
    dag_id='SINAN_DENG',
    tags=['SINAN', 'Brasil', 'Dengue'],
    schedule='@monthly',
    default_args=default_args,
    catchup=False,
) as dag:
    from airflow.models import Variable

    # egh_conn is expected to deserialize to {"URI": "postgresql://..."}
    # (see AIRFLOW_VAR_EGH_CONN in containers/airflow/env.tpl).
    CONN = Variable.get('egh_conn', deserialize_json=True)

    @task.external_python(
        task_id='first',
        python='/opt/py311/bin/python3.11',
        expect_airflow=True
    )
    def update_dengue(egh_conn: dict):
        """
        Fetch dengue (DENG) files from the SINAN FTP and load them into
        brasil.sinan_dengue_m.

        Years with a *final* release are inserted once: any preliminary rows
        for that year are deleted first, and nothing is done if final rows
        already exist. Years that only have a *preliminary* release are
        refreshed on every run (delete then re-insert).

        Parameters
        ----------
        egh_conn : dict
            Connection info; must contain the SQLAlchemy URI under 'URI'.
        """
        # Imports live inside the task: it runs in the external
        # /opt/py311 interpreter, not in the scheduler's environment.
        from pysus.online_data import parquets_to_dataframe
        from pysus.ftp.databases.sinan import SINAN
        from sqlalchemy import create_engine  # was missing: create_engine was undefined

        sinan = SINAN().load()
        dis_code = "DENG"
        tablename = "sinan_dengue_m"
        schema = "brasil"
        engine = create_engine(egh_conn['URI'])

        # fix: the original passed the undefined name `disease`
        files = sinan.get_files(dis_code=dis_code)

        # Group the available years by release stage ('prelim' or 'final').
        f_stage = {}
        for file in files:
            code, year = sinan.format(file)
            stage = 'prelim' if 'PRELIM' in file.path else 'final'
            f_stage.setdefault(stage, []).append(year)

        # NOTE: `year` comes from the SINAN FTP listing (trusted source);
        # it is interpolated into SQL below — keep it that way only while
        # the source stays trusted.
        for year in f_stage.get('final', []):
            with engine.connect() as conn:
                cur = conn.execute(
                    f'SELECT COUNT(*) FROM {schema}.{tablename}'
                    f' WHERE year = {year} AND prelim = False'
                )
                # fetchone() returns a 1-tuple like (0,), which is always
                # truthy — the original `if not count:` never fired.
                # Unpack the scalar instead.
                count = cur.fetchone()[0]

                if not count:
                    # No final rows yet for this year: drop any preliminary
                    # rows before inserting the final dataset.
                    conn.execute(
                        f'DELETE FROM {schema}.{tablename}'
                        f' WHERE year = {year} AND prelim = True'
                    )

                    file = sinan.download(sinan.get_files(dis_code, year))

                    df = parquets_to_dataframe(file.path)
                    df['year'] = year
                    df['prelim'] = False
                    df.to_sql(
                        name=tablename,
                        con=conn,  # was `engine.connect()` with `engine` undefined
                        schema=schema,
                        if_exists='append',
                        index=False
                    )

        for year in f_stage.get('prelim', []):
            with engine.connect() as conn:
                # Preliminary data is refreshed on every run:
                # delete the year's prelim rows, then re-insert.
                conn.execute(
                    f'DELETE FROM {schema}.{tablename}'
                    f' WHERE year = {year} AND prelim = True'
                )

                file = sinan.download(sinan.get_files(dis_code, year))

                df = parquets_to_dataframe(file.path)
                df['year'] = year
                df['prelim'] = True
                df.to_sql(
                    name=tablename,
                    con=conn,  # was `engine.connect()` with `engine` undefined
                    schema=schema,
                    if_exists='append',
                    index=False
                )

    update_dengue(CONN)

containers/airflow/dags/brasil/sinan/zika.py

Whitespace-only changes.

containers/airflow/env.tpl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
AIRFLOW_PROJ_DIR=${AIRFLOW_PROJ_DIR}
2+
AIRFLOW_UID=${AIRFLOW_UID}
3+
AIRFLOW_PORT=${AIRFLOW_PORT}
4+
_AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME}
5+
_AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD}
6+
7+
AIRFLOW__CORE__FERNET_KEY=${AIRFLOW__CORE__FERNET_KEY}
8+
9+
AIRFLOW__SMTP__SMTP_HOST=${AIRFLOW__SMTP__SMTP_HOST}
10+
AIRFLOW__SMTP__SMTP_USER=${AIRFLOW__SMTP__SMTP_USER}
11+
AIRFLOW__SMTP__SMTP_PASSWORD=${AIRFLOW__SMTP__SMTP_PASSWORD}
12+
AIRFLOW__SMTP__SMTP_PORT=${AIRFLOW__SMTP__SMTP_PORT:-587}
13+
AIRFLOW__SMTP__SMTP_MAIL_FROM=${AIRFLOW__SMTP__SMTP_MAIL_FROM}
14+
15+
POSTGRES_EPIGRAPH_DB=${POSTGRES_EPIGRAPH_DB}
16+
POSTGRES_EPIGRAPH_HOST=${POSTGRES_EPIGRAPH_HOST}
17+
POSTGRES_EPIGRAPH_PORT=${POSTGRES_EPIGRAPH_PORT}
18+
POSTGRES_EPIGRAPH_USER=${POSTGRES_EPIGRAPH_USER}
19+
POSTGRES_EPIGRAPH_PASSWORD=${POSTGRES_EPIGRAPH_PASSWORD}
20+
AIRFLOW_VAR_EGH_CONN='{"URI":"postgresql://${POSTGRES_EPIGRAPH_USER}:${POSTGRES_EPIGRAPH_PASSWORD}@${POSTGRES_EPIGRAPH_HOST}:${POSTGRES_EPIGRAPH_PORT}/${POSTGRES_EPIGRAPH_DB}"}'

0 commit comments

Comments
 (0)