diff --git a/README.md b/README.md index 558c133..e9063b9 100644 --- a/README.md +++ b/README.md @@ -1 +1,86 @@ -# execution_scripts \ No newline at end of file +# execution_scripts + +Shell scripts for submitting and managing Spark workloads across multiple execution engines. + +## Supported Engines + +| Engine | Directory | Description | +|--------|-----------|-------------| +| **Databricks** | [`databricks/`](databricks/) | Trigger existing jobs or run notebooks via the Databricks Jobs API 2.1 | +| **Spark Standalone** | [`spark-standalone/`](spark-standalone/) | Start/stop a local Spark cluster and submit jobs via `spark-submit` | +| **Google Cloud Dataproc** | [`dataproc/`](dataproc/) | Create/delete Dataproc clusters and submit Spark/PySpark/Hadoop jobs via `gcloud` | + +## Quick Start + +### Databricks + +```bash +export DATABRICKS_HOST="https://.azuredatabricks.net" +export DATABRICKS_TOKEN="dapi..." + +# Trigger an existing job and wait for completion +./databricks/run_job.sh --job-id 42 --wait + +# Run a notebook once on an existing cluster +./databricks/submit_notebook.sh \ + --notebook-path /Shared/etl/my_notebook \ + --cluster-id \ + --wait +``` + +### Spark Standalone + +```bash +export SPARK_HOME=/opt/spark + +# Start master + 2 workers +./spark-standalone/start_cluster.sh --workers 2 --worker-cores 4 --worker-memory 8g + +# Submit a PySpark job +./spark-standalone/submit_job.sh --app /path/to/etl.py + +# Stop the cluster +./spark-standalone/stop_cluster.sh +``` + +### Google Cloud Dataproc + +```bash +export GCP_PROJECT="my-gcp-project" +export GCP_REGION="us-central1" + +# Create a cluster +./dataproc/create_cluster.sh --cluster-name my-cluster --num-workers 4 + +# Submit a PySpark job +./dataproc/submit_job.sh \ + --cluster-name my-cluster \ + --job-type pyspark \ + --app gs://my-bucket/etl.py \ + -- --date 2024-01-01 + +# Delete the cluster +./dataproc/delete_cluster.sh --cluster-name my-cluster --yes +``` + +## Repository Structure + +``` +execution_scripts/ +├── databricks/ +│ ├── README.md +│ ├── run_job.sh # Trigger an existing Databricks job +│ └── submit_notebook.sh # Run a notebook as a one-time run +├── spark-standalone/ +│ ├── README.md +│ ├── start_cluster.sh # Start Spark master + workers +│ ├── stop_cluster.sh # Stop the cluster +│ └── submit_job.sh # Submit a job via spark-submit +└── dataproc/ + ├── README.md + ├── create_cluster.sh # Create a Dataproc cluster + ├── delete_cluster.sh # Delete a Dataproc cluster + └── submit_job.sh # Submit a job to Dataproc +``` + +For engine-specific usage and options, refer to the README in each subdirectory. \ No newline at end of file diff --git a/databricks/README.md b/databricks/README.md new file mode 100644 index 0000000..1b7395e --- /dev/null +++ b/databricks/README.md @@ -0,0 +1,87 @@ +# Databricks Execution Scripts + +Shell scripts to interact with Databricks workspaces via the **Jobs API 2.1** and the `runs/submit` endpoint. + +## Prerequisites + +| Tool | Purpose | +|------|---------| +| `curl` | HTTP calls to the Databricks REST API | +| `jq` | JSON parsing / construction | +| `python3` | URL-encoding job names in `run_job.sh` | + +Set the following environment variables before running any script: + +```bash +export DATABRICKS_HOST="https://.azuredatabricks.net" +export DATABRICKS_TOKEN="dapi..." # personal-access token or SP OAuth token +``` + +--- + +## Scripts + +### `run_job.sh` – Trigger an existing job + +Triggers a run for an existing Databricks job (by ID or name) and optionally waits for completion. + +``` +Usage: ./run_job.sh [OPTIONS] + +Options: + -j, --job-id Existing job ID to trigger + -n, --job-name Look up job ID by name + -p, --params Notebook/task parameters as a JSON string + -w, --wait Block until the run finishes + -t, --timeout Polling timeout (default: 3600) + -h, --help Show help +``` + +**Examples:** + +```bash +# Trigger job by ID and wait +./run_job.sh --job-id 42 --wait + +# Trigger job by name with parameters +./run_job.sh --job-name "nightly_etl" \ + --params '{"date":"2024-01-01","env":"prod"}' \ + --wait +``` + +--- + +### `submit_notebook.sh` – Run a notebook as a one-time run + +Submits a notebook to run immediately on an existing or a new ephemeral cluster. + +``` +Usage: ./submit_notebook.sh [OPTIONS] + +Options: + -n, --notebook-path Workspace path to the notebook (required) + -c, --cluster-id Existing cluster ID + --new-cluster New-cluster specification JSON + -p, --params Notebook base parameters + -r, --run-name Human-readable run name + -w, --wait Block until the run finishes + -t, --timeout Polling timeout (default: 3600) + -h, --help Show help +``` + +**Examples:** + +```bash +# Run a notebook on an existing cluster +./submit_notebook.sh \ + --notebook-path /Shared/etl/transform \ + --cluster-id 1234-567890-abc12345 \ + --params '{"date":"2024-01-01"}' \ + --wait + +# Run a notebook on a new ephemeral cluster +./submit_notebook.sh \ + --notebook-path /Shared/etl/transform \ + --new-cluster '{"num_workers":4,"spark_version":"14.3.x-scala2.12","node_type_id":"m5d.xlarge"}' \ + --wait +``` diff --git a/databricks/run_job.sh b/databricks/run_job.sh new file mode 100755 index 0000000..88e0272 --- /dev/null +++ b/databricks/run_job.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# run_job.sh – Submit and optionally wait for a Databricks job run. +# +# Usage: +# ./run_job.sh [OPTIONS] +# +# Required environment variables (or pass via flags): +# DATABRICKS_HOST – Workspace URL, e.g. https://.azuredatabricks.net +# DATABRICKS_TOKEN – Personal-access token (or SP OAuth token) +# +# Options: +# -j, --job-id Existing job ID to trigger a run +# -n, --job-name Look up job ID by name (mutually exclusive with -j) +# -p, --params Notebook/task parameters as a JSON string (optional) +# -w, --wait Block until the run finishes and exit with its result code +# -t, --timeout Polling timeout when --wait is set (default: 3600) +# -h, --help Show this help message +# +# Examples: +# DATABRICKS_HOST=https://adb-xxx.azuredatabricks.net \ +# DATABRICKS_TOKEN=dapi... \ +# ./run_job.sh --job-id 42 --wait +# +# ./run_job.sh --job-name "nightly_etl" --params '{"date":"2024-01-01"}' --wait + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +WAIT=false +TIMEOUT=3600 +POLL_INTERVAL=15 +JOB_ID="" +JOB_NAME="" +PARAMS="" + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +urlencode() { python3 -c "import urllib.parse,sys; print(urllib.parse.quote(sys.argv[1]))" "$1"; } + +api_call() { + local method="$1" path="$2" data="${3:-}" + local url="${DATABRICKS_HOST%/}/api/2.1${path}" + if [[ -n "$data" ]]; then + curl -sSf -X "$method" \ + -H "Authorization: Bearer ${DATABRICKS_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "$data" "$url" + else + curl -sSf -X "$method" \ + -H "Authorization: Bearer ${DATABRICKS_TOKEN}" \ + "$url" + fi +} + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -j|--job-id) JOB_ID="$2"; shift 2 ;; + -n|--job-name) JOB_NAME="$2"; shift 2 ;; + -p|--params) PARAMS="$2"; shift 2 ;; + -w|--wait) WAIT=true; shift ;; + -t|--timeout) TIMEOUT="$2"; shift 2 ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── validation ───────────────────────────────────────────────────────────────── +require_cmd curl +require_cmd jq +require_cmd python3 + +[[ -z "${DATABRICKS_HOST:-}" ]] && die "DATABRICKS_HOST is not set." +[[ -z "${DATABRICKS_TOKEN:-}" ]] && die "DATABRICKS_TOKEN is not set." +[[ -z "$JOB_ID" && -z "$JOB_NAME" ]] && die "Provide --job-id or --job-name." +[[ -n "$JOB_ID" && -n "$JOB_NAME" ]] && die "--job-id and --job-name are mutually exclusive." + +# ── resolve job name → job ID ────────────────────────────────────────────────── +if [[ -n "$JOB_NAME" ]]; then + echo "Looking up job ID for name: '${JOB_NAME}' …" + JOBS_JSON=$(api_call GET "/jobs/list?name=$(urlencode "$JOB_NAME")") + JOB_ID=$(echo "$JOBS_JSON" | jq -r '.jobs[0].job_id // empty') + [[ -z "$JOB_ID" ]] && die "No job found with name '${JOB_NAME}'." + echo "Resolved job ID: ${JOB_ID}" +fi + +# ── trigger run ──────────────────────────────────────────────────────────────── +PAYLOAD="{\"job_id\": ${JOB_ID}}" +if [[ -n "$PARAMS" ]]; then + PAYLOAD=$(echo "$PAYLOAD" | jq --argjson p "$PARAMS" '. + {notebook_params: $p}') +fi + +echo "Triggering run for job ID ${JOB_ID} …" +RUN_RESPONSE=$(api_call POST "/jobs/run-now" "$PAYLOAD") +RUN_ID=$(echo "$RUN_RESPONSE" | jq -r '.run_id') +echo "Run ID: ${RUN_ID}" +echo "Run URL: ${DATABRICKS_HOST%/}/#job/${JOB_ID}/run/${RUN_ID}" + +# ── optional wait ────────────────────────────────────────────────────────────── +if [[ "$WAIT" == "true" ]]; then + echo "Waiting for run ${RUN_ID} to complete (timeout: ${TIMEOUT}s) …" + ELAPSED=0 + while true; do + RUN_STATE=$(api_call GET "/jobs/runs/get?run_id=${RUN_ID}" | jq -r '.state') + LIFE_CYCLE=$(echo "$RUN_STATE" | jq -r '.life_cycle_state') + RESULT_STATE=$(echo "$RUN_STATE" | jq -r '.result_state // "N/A"') + echo " [${ELAPSED}s] lifecycle=${LIFE_CYCLE} result=${RESULT_STATE}" + + if [[ "$LIFE_CYCLE" == "TERMINATED" || "$LIFE_CYCLE" == "SKIPPED" || "$LIFE_CYCLE" == "INTERNAL_ERROR" ]]; then + echo "Run finished with result state: ${RESULT_STATE}" + [[ "$RESULT_STATE" == "SUCCESS" ]] && exit 0 + exit 1 + fi + + if [[ "$ELAPSED" -ge "$TIMEOUT" ]]; then + die "Timed out after ${TIMEOUT}s waiting for run ${RUN_ID}." + fi + + sleep "$POLL_INTERVAL" + ELAPSED=$(( ELAPSED + POLL_INTERVAL )) + done +fi diff --git a/databricks/submit_notebook.sh b/databricks/submit_notebook.sh new file mode 100755 index 0000000..5b2a056 --- /dev/null +++ b/databricks/submit_notebook.sh @@ -0,0 +1,144 @@ +#!/usr/bin/env bash +# submit_notebook.sh – Run a Databricks notebook as a one-time job run. +# +# Usage: +# ./submit_notebook.sh [OPTIONS] +# +# Required environment variables (or pass via flags): +# DATABRICKS_HOST – Workspace URL, e.g. https://.azuredatabricks.net +# DATABRICKS_TOKEN – Personal-access token (or SP OAuth token) +# +# Options: +# -n, --notebook-path Absolute workspace path to the notebook (required) +# -c, --cluster-id Existing cluster ID to run on +# --new-cluster New-cluster spec JSON (mutually exclusive with -c) +# -p, --params Notebook parameters as a JSON object (optional) +# -r, --run-name Human-readable run name (optional) +# -w, --wait Block until the run finishes and exit with its result code +# -t, --timeout Polling timeout when --wait is set (default: 3600) +# -h, --help Show this help message +# +# Examples: +# DATABRICKS_HOST=https://adb-xxx.azuredatabricks.net \ +# DATABRICKS_TOKEN=dapi... \ +# ./submit_notebook.sh \ +# --notebook-path /Shared/etl/my_notebook \ +# --cluster-id 1234-567890-abc12345 \ +# --params '{"env":"prod","date":"2024-01-01"}' \ +# --wait + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +WAIT=false +TIMEOUT=3600 +POLL_INTERVAL=15 +NOTEBOOK_PATH="" +CLUSTER_ID="" +NEW_CLUSTER_JSON="" +PARAMS="" +RUN_NAME="notebook_run_$(date +%Y%m%d_%H%M%S)" + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +api_call() { + local method="$1" path="$2" data="${3:-}" + local url="${DATABRICKS_HOST%/}/api/2.1${path}" + if [[ -n "$data" ]]; then + curl -sSf -X "$method" \ + -H "Authorization: Bearer ${DATABRICKS_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "$data" "$url" + else + curl -sSf -X "$method" \ + -H "Authorization: Bearer ${DATABRICKS_TOKEN}" \ + "$url" + fi +} + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -n|--notebook-path) NOTEBOOK_PATH="$2"; shift 2 ;; + -c|--cluster-id) CLUSTER_ID="$2"; shift 2 ;; + --new-cluster) NEW_CLUSTER_JSON="$2"; shift 2 ;; + -p|--params) PARAMS="$2"; shift 2 ;; + -r|--run-name) RUN_NAME="$2"; shift 2 ;; + -w|--wait) WAIT=true; shift ;; + -t|--timeout) TIMEOUT="$2"; shift 2 ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── validation ───────────────────────────────────────────────────────────────── +require_cmd curl +require_cmd jq + +[[ -z "${DATABRICKS_HOST:-}" ]] && die "DATABRICKS_HOST is not set." +[[ -z "${DATABRICKS_TOKEN:-}" ]] && die "DATABRICKS_TOKEN is not set." +[[ -z "$NOTEBOOK_PATH" ]] && die "--notebook-path is required." +[[ -n "$CLUSTER_ID" && -n "$NEW_CLUSTER_JSON" ]] && die "--cluster-id and --new-cluster are mutually exclusive." +[[ -z "$CLUSTER_ID" && -z "$NEW_CLUSTER_JSON" ]] && die "Provide --cluster-id or --new-cluster." + +# ── build cluster spec ───────────────────────────────────────────────────────── +if [[ -n "$CLUSTER_ID" ]]; then + CLUSTER_SPEC="{\"existing_cluster_id\": \"${CLUSTER_ID}\"}" +else + CLUSTER_SPEC="{\"new_cluster\": ${NEW_CLUSTER_JSON}}" +fi + +# ── build notebook task ──────────────────────────────────────────────────────── +NOTEBOOK_TASK="{\"notebook_path\": \"${NOTEBOOK_PATH}\"" +if [[ -n "$PARAMS" ]]; then + NOTEBOOK_TASK="${NOTEBOOK_TASK}, \"base_parameters\": ${PARAMS}" +fi +NOTEBOOK_TASK="${NOTEBOOK_TASK}}" + +# ── assemble payload ─────────────────────────────────────────────────────────── +PAYLOAD=$(jq -n \ + --arg run_name "$RUN_NAME" \ + --argjson cluster_spec "$CLUSTER_SPEC" \ + --argjson notebook_task "$NOTEBOOK_TASK" \ + '$cluster_spec + {run_name: $run_name, notebook_task: $notebook_task}') + +# ── submit run ───────────────────────────────────────────────────────────────── +echo "Submitting notebook run: '${RUN_NAME}' …" +echo " Notebook: ${NOTEBOOK_PATH}" +RUN_RESPONSE=$(api_call POST "/jobs/runs/submit" "$PAYLOAD") +RUN_ID=$(echo "$RUN_RESPONSE" | jq -r '.run_id') +echo "Run ID: ${RUN_ID}" +echo "Run URL: ${DATABRICKS_HOST%/}/#job/runs/${RUN_ID}" + +# ── optional wait ────────────────────────────────────────────────────────────── +if [[ "$WAIT" == "true" ]]; then + echo "Waiting for run ${RUN_ID} to complete (timeout: ${TIMEOUT}s) …" + ELAPSED=0 + while true; do + RUN_STATE=$(api_call GET "/jobs/runs/get?run_id=${RUN_ID}" | jq -r '.state') + LIFE_CYCLE=$(echo "$RUN_STATE" | jq -r '.life_cycle_state') + RESULT_STATE=$(echo "$RUN_STATE" | jq -r '.result_state // "N/A"') + echo " [${ELAPSED}s] lifecycle=${LIFE_CYCLE} result=${RESULT_STATE}" + + if [[ "$LIFE_CYCLE" == "TERMINATED" || "$LIFE_CYCLE" == "SKIPPED" || "$LIFE_CYCLE" == "INTERNAL_ERROR" ]]; then + echo "Run finished with result state: ${RESULT_STATE}" + [[ "$RESULT_STATE" == "SUCCESS" ]] && exit 0 + exit 1 + fi + + if [[ "$ELAPSED" -ge "$TIMEOUT" ]]; then + die "Timed out after ${TIMEOUT}s waiting for run ${RUN_ID}." + fi + + sleep "$POLL_INTERVAL" + ELAPSED=$(( ELAPSED + POLL_INTERVAL )) + done +fi diff --git a/dataproc/README.md b/dataproc/README.md new file mode 100644 index 0000000..3d299f0 --- /dev/null +++ b/dataproc/README.md @@ -0,0 +1,138 @@ +# Dataproc Execution Scripts + +Shell scripts to manage **Google Cloud Dataproc** clusters and submit Spark/PySpark/Hadoop jobs via the `gcloud` CLI. + +## Prerequisites + +| Tool | Notes | +|------|-------| +| `gcloud` CLI | Install from https://cloud.google.com/sdk/docs/install | +| Authenticated session | Run `gcloud auth login` or use a service account via `GOOGLE_APPLICATION_CREDENTIALS` | + +Set the following environment variables before running any script: + +```bash +export GCP_PROJECT="my-gcp-project" +export GCP_REGION="us-central1" +``` + +--- + +## Scripts + +### `create_cluster.sh` – Create a Dataproc cluster + +``` +Usage: ./create_cluster.sh [OPTIONS] + +Options: + -p, --project GCP project ID + -r, --region GCP region + -z, --zone GCP zone (optional) + -n, --cluster-name Cluster name (required) + -i, --image-version Dataproc image version (default: 2.1-debian11) + -m, --master-machine Master machine type (default: n1-standard-4) + -M, --master-disk-size Master disk in GB (default: 100) + -w, --num-workers Number of workers (default: 2) + --worker-machine Worker machine type (default: n1-standard-4) + --worker-disk-size Worker disk in GB (default: 100) + -b, --bucket GCS staging bucket + --max-idle Auto-delete after idle (e.g. 30m) + --max-age Max cluster lifetime (e.g. 4h) + --label Cluster labels (repeatable) + --property Dataproc/Spark properties (repeatable) + -h, --help Show help +``` + +**Examples:** + +```bash +# Create a basic 4-worker cluster that auto-deletes after 30 minutes idle +./create_cluster.sh \ + --cluster-name etl-cluster \ + --num-workers 4 \ + --max-idle 30m \ + --label env=prod + +# Create a cluster with custom Spark properties +./create_cluster.sh \ + --cluster-name analytics \ + --num-workers 8 \ + --property spark:spark.executor.memory=8g \ + --property spark:spark.sql.shuffle.partitions=400 +``` + +--- + +### `delete_cluster.sh` – Delete a Dataproc cluster + +``` +Usage: ./delete_cluster.sh [OPTIONS] + +Options: + -p, --project GCP project ID + -r, --region GCP region + -n, --cluster-name Cluster to delete (required) + -y, --yes Skip confirmation prompt + -h, --help Show help +``` + +**Example:** + +```bash +./delete_cluster.sh --cluster-name etl-cluster --yes +``` + +--- + +### `submit_job.sh` – Submit a job to Dataproc + +``` +Usage: ./submit_job.sh [OPTIONS] -- [JOB_ARGS] + +Options: + -p, --project GCP project ID + -r, --region GCP region + -n, --cluster-name Target cluster (required) + -t, --job-type spark | pyspark | hadoop | hive | pig | presto | spark-r | spark-sql + (default: spark) + -a, --app JAR / Python / script path (gs:// or local) + -c, --class Main class (required for spark JAR jobs) + --jars Additional JARs (comma-separated) + --files Files to stage (comma-separated) + --py-files Python dependency files (comma-separated) + --archives Archives to stage (comma-separated) + -e, --executor-memory spark.executor.memory (e.g. 4g) + -x, --num-executors spark.executor.instances + --property Additional job property (repeatable) + -l, --labels Job labels (repeatable) + -w, --wait Wait for job completion (default) + --async Submit and return immediately + -h, --help Show help +``` + +**Examples:** + +```bash +# Submit a PySpark job +./submit_job.sh \ + --cluster-name my-cluster \ + --job-type pyspark \ + --app gs://my-bucket/etl.py \ + -- --date 2024-01-01 + +# Submit a Spark JAR job +./submit_job.sh \ + --cluster-name my-cluster \ + --job-type spark \ + --app gs://my-bucket/myapp.jar \ + --class com.example.Main \ + --executor-memory 4g \ + --num-executors 8 + +# Submit a Hive query +./submit_job.sh \ + --cluster-name my-cluster \ + --job-type hive \ + --app gs://my-bucket/query.hql +``` diff --git a/dataproc/create_cluster.sh b/dataproc/create_cluster.sh new file mode 100755 index 0000000..358bde9 --- /dev/null +++ b/dataproc/create_cluster.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# create_cluster.sh – Create a Google Cloud Dataproc cluster. +# +# Usage: +# ./create_cluster.sh [OPTIONS] +# +# Prerequisites: +# - gcloud CLI authenticated (run: gcloud auth login or use a service account) +# - Required variables: GCP_PROJECT, GCP_REGION (or pass via flags) +# +# Options: +# -p, --project GCP project ID (default: $GCP_PROJECT) +# -r, --region GCP region, e.g. us-central1 (default: $GCP_REGION) +# -z, --zone GCP zone (optional; if omitted, auto-selected) +# -n, --cluster-name Cluster name (required) +# -i, --image-version Dataproc image version (default: 2.1-debian11) +# -m, --master-machine Master machine type (default: n1-standard-4) +# -M, --master-disk-size Master boot disk size in GB (default: 100) +# -w, --num-workers Number of worker nodes (default: 2) +# --worker-machine Worker machine type (default: n1-standard-4) +# --worker-disk-size Worker boot disk size in GB (default: 100) +# -b, --bucket GCS staging bucket (optional) +# --max-idle Auto-delete after idle, e.g. 30m (optional) +# --max-age Max cluster age, e.g. 4h (optional) +# --label Label to attach to the cluster (repeatable) +# --property Dataproc/Spark property, e.g. spark:spark.executor.memory=4g (repeatable) +# -h, --help Show this help message +# +# Examples: +# GCP_PROJECT=my-project GCP_REGION=us-central1 \ +# ./create_cluster.sh --cluster-name my-cluster --num-workers 4 +# +# ./create_cluster.sh \ +# --project my-project --region us-central1 \ +# --cluster-name etl-cluster \ +# --image-version 2.1-debian11 \ +# --num-workers 4 \ +# --max-idle 30m \ +# --label env=prod --label team=data-eng \ +# --property spark:spark.executor.memory=4g + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +IMAGE_VERSION="2.1-debian11" +MASTER_MACHINE="n1-standard-4" +MASTER_DISK_SIZE=100 +NUM_WORKERS=2 +WORKER_MACHINE="n1-standard-4" +WORKER_DISK_SIZE=100 +CLUSTER_NAME="" +BUCKET="" +ZONE="" +MAX_IDLE="" +MAX_AGE="" +LABELS=() +PROPERTIES=() + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -p|--project) GCP_PROJECT="$2"; shift 2 ;; + -r|--region) GCP_REGION="$2"; shift 2 ;; + -z|--zone) ZONE="$2"; shift 2 ;; + -n|--cluster-name) CLUSTER_NAME="$2"; shift 2 ;; + -i|--image-version) IMAGE_VERSION="$2"; shift 2 ;; + -m|--master-machine) MASTER_MACHINE="$2"; shift 2 ;; + -M|--master-disk-size) MASTER_DISK_SIZE="$2"; shift 2 ;; + -w|--num-workers) NUM_WORKERS="$2"; shift 2 ;; + --worker-machine) WORKER_MACHINE="$2"; shift 2 ;; + --worker-disk-size) WORKER_DISK_SIZE="$2"; shift 2 ;; + -b|--bucket) BUCKET="$2"; shift 2 ;; + --max-idle) MAX_IDLE="$2"; shift 2 ;; + --max-age) MAX_AGE="$2"; shift 2 ;; + --label) LABELS+=("$2"); shift 2 ;; + --property) PROPERTIES+=("$2"); shift 2 ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── validation ───────────────────────────────────────────────────────────────── +require_cmd gcloud + +: "${GCP_PROJECT:?GCP_PROJECT is not set. Use --project or export GCP_PROJECT.}" +: "${GCP_REGION:?GCP_REGION is not set. Use --region or export GCP_REGION.}" +[[ -z "$CLUSTER_NAME" ]] && die "--cluster-name is required." + +# ── build gcloud command ─────────────────────────────────────────────────────── +CMD=( + gcloud dataproc clusters create "$CLUSTER_NAME" + --project "$GCP_PROJECT" + --region "$GCP_REGION" + --image-version "$IMAGE_VERSION" + --master-machine-type "$MASTER_MACHINE" + --master-boot-disk-size "${MASTER_DISK_SIZE}GB" + --num-workers "$NUM_WORKERS" + --worker-machine-type "$WORKER_MACHINE" + --worker-boot-disk-size "${WORKER_DISK_SIZE}GB" +) + +[[ -n "$ZONE" ]] && CMD+=(--zone "$ZONE") +[[ -n "$BUCKET" ]] && CMD+=(--bucket "$BUCKET") +[[ -n "$MAX_IDLE" ]] && CMD+=(--max-idle "$MAX_IDLE") +[[ -n "$MAX_AGE" ]] && CMD+=(--max-age "$MAX_AGE") + +if [[ ${#LABELS[@]} -gt 0 ]]; then + LABEL_STR=$(IFS=,; echo "${LABELS[*]}") + CMD+=(--labels "$LABEL_STR") +fi + +if [[ ${#PROPERTIES[@]} -gt 0 ]]; then + PROP_STR=$(IFS=,; echo "${PROPERTIES[*]}") + CMD+=(--properties "$PROP_STR") +fi + +# ── print and run ────────────────────────────────────────────────────────────── +echo "Creating Dataproc cluster '${CLUSTER_NAME}' in ${GCP_PROJECT}/${GCP_REGION} …" +echo " Image: ${IMAGE_VERSION}" +echo " Master: ${MASTER_MACHINE} / ${MASTER_DISK_SIZE} GB" +echo " Workers: ${NUM_WORKERS} × ${WORKER_MACHINE} / ${WORKER_DISK_SIZE} GB" +[[ -n "$MAX_IDLE" ]] && echo " Max idle: ${MAX_IDLE}" +[[ -n "$MAX_AGE" ]] && echo " Max age: ${MAX_AGE}" +echo "" +echo "Running: ${CMD[*]}" +echo "" +exec "${CMD[@]}" diff --git a/dataproc/delete_cluster.sh b/dataproc/delete_cluster.sh new file mode 100755 index 0000000..f36a6f3 --- /dev/null +++ b/dataproc/delete_cluster.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# delete_cluster.sh – Delete a Google Cloud Dataproc cluster. +# +# Usage: +# ./delete_cluster.sh [OPTIONS] +# +# Options: +# -p, --project GCP project ID (default: $GCP_PROJECT) +# -r, --region GCP region (default: $GCP_REGION) +# -n, --cluster-name Cluster name to delete (required) +# -y, --yes Skip confirmation prompt +# -h, --help Show this help message +# +# Examples: +# GCP_PROJECT=my-project GCP_REGION=us-central1 \ +# ./delete_cluster.sh --cluster-name my-cluster --yes + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +CLUSTER_NAME="" +SKIP_CONFIRM=false + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -p|--project) GCP_PROJECT="$2"; shift 2 ;; + -r|--region) GCP_REGION="$2"; shift 2 ;; + -n|--cluster-name) CLUSTER_NAME="$2"; shift 2 ;; + -y|--yes) SKIP_CONFIRM=true; shift ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── validation ───────────────────────────────────────────────────────────────── +require_cmd gcloud + +: "${GCP_PROJECT:?GCP_PROJECT is not set. Use --project or export GCP_PROJECT.}" +: "${GCP_REGION:?GCP_REGION is not set. Use --region or export GCP_REGION.}" +[[ -z "$CLUSTER_NAME" ]] && die "--cluster-name is required." + +# ── confirmation ─────────────────────────────────────────────────────────────── +if [[ "$SKIP_CONFIRM" == "false" ]]; then + read -r -p "Delete Dataproc cluster '${CLUSTER_NAME}' in ${GCP_PROJECT}/${GCP_REGION}? [y/N] " REPLY + [[ "$REPLY" =~ ^[Yy]$ ]] || { echo "Aborted."; exit 0; } +fi + +# ── delete cluster ───────────────────────────────────────────────────────────── +echo "Deleting cluster '${CLUSTER_NAME}' …" +exec gcloud dataproc clusters delete "$CLUSTER_NAME" \ + --project "$GCP_PROJECT" \ + --region "$GCP_REGION" \ + --quiet diff --git a/dataproc/submit_job.sh b/dataproc/submit_job.sh new file mode 100755 index 0000000..92bbd30 --- /dev/null +++ b/dataproc/submit_job.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +# submit_job.sh – Submit a Spark/PySpark/Hadoop job to a Google Cloud Dataproc cluster. +# +# Usage: +# ./submit_job.sh [OPTIONS] -- [JOB_ARGS] +# +# Prerequisites: +# - gcloud CLI authenticated +# - Required variables: GCP_PROJECT, GCP_REGION (or pass via flags) +# +# Options: +# -p, --project GCP project ID (default: $GCP_PROJECT) +# -r, --region GCP region (default: $GCP_REGION) +# -n, --cluster-name Target Dataproc cluster name (required) +# -t, --job-type Job type: spark | pyspark | hadoop | hive | pig | presto | spark-r | spark-sql +# (default: spark) +# -a, --app JAR / Python / script file (gs:// or local path) +# -c, --class Main class (required for spark JAR jobs) +# --jars Comma-separated list of additional JARs (optional) +# --files Comma-separated list of files to stage (optional) +# --py-files Comma-separated Python dependency files (optional) +# --archives Comma-separated archives to stage (optional) +# -e, --executor-memory spark.executor.memory, e.g. 4g (optional) +# -x, --num-executors spark.executor.instances (optional) +# --property Additional Spark/job property (repeatable) +# -l, --labels Labels for this job (repeatable) +# -w, --wait Block until the job finishes (default: true) +# --async Submit and return immediately (don't wait) +# -h, --help Show this help message +# +# Everything after -- is passed as job arguments. +# +# Examples: +# # Submit a PySpark job +# GCP_PROJECT=my-project GCP_REGION=us-central1 \ +# ./submit_job.sh \ +# --cluster-name my-cluster \ +# --job-type pyspark \ +# --app gs://my-bucket/etl.py \ +# -- --date 2024-01-01 +# +# # Submit a Spark JAR job +# ./submit_job.sh \ +# --project my-project --region us-central1 \ +# --cluster-name my-cluster \ +# --job-type spark \ +# --app gs://my-bucket/myapp.jar \ +# --class com.example.Main \ +# --executor-memory 4g \ +# --num-executors 8 \ +# --property spark.sql.shuffle.partitions=200 + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +JOB_TYPE="spark" +CLUSTER_NAME="" +APP="" +MAIN_CLASS="" +EXTRA_JARS="" +EXTRA_FILES="" +PY_FILES="" +ARCHIVES="" +EXECUTOR_MEMORY="" +NUM_EXECUTORS="" +PROPERTIES=() +LABELS=() +WAIT_MODE="--wait" +JOB_ARGS=() + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -p|--project) GCP_PROJECT="$2"; shift 2 ;; + -r|--region) GCP_REGION="$2"; shift 2 ;; + -n|--cluster-name) CLUSTER_NAME="$2"; shift 2 ;; + -t|--job-type) JOB_TYPE="$2"; shift 2 ;; + -a|--app) APP="$2"; shift 2 ;; + -c|--class) MAIN_CLASS="$2"; shift 2 ;; + --jars) EXTRA_JARS="$2"; shift 2 ;; + --files) EXTRA_FILES="$2"; shift 2 ;; + --py-files) PY_FILES="$2"; shift 2 ;; + --archives) ARCHIVES="$2"; shift 2 ;; + -e|--executor-memory) EXECUTOR_MEMORY="$2"; shift 2 ;; + -x|--num-executors) NUM_EXECUTORS="$2"; shift 2 ;; + --property) PROPERTIES+=("$2"); shift 2 ;; + -l|--labels) LABELS+=("$2"); shift 2 ;; + -w|--wait) WAIT_MODE="--wait"; shift ;; + --async) WAIT_MODE="--async"; shift ;; + -h|--help) usage ;; + --) shift; JOB_ARGS=("$@"); break ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── validation ───────────────────────────────────────────────────────────────── +require_cmd gcloud + +: "${GCP_PROJECT:?GCP_PROJECT is not set. Use --project or export GCP_PROJECT.}" +: "${GCP_REGION:?GCP_REGION is not set. Use --region or export GCP_REGION.}" +[[ -z "$CLUSTER_NAME" ]] && die "--cluster-name is required." + +VALID_TYPES=("spark" "pyspark" "hadoop" "hive" "pig" "presto" "spark-r" "spark-sql") +VALID=false +for t in "${VALID_TYPES[@]}"; do [[ "$t" == "$JOB_TYPE" ]] && VALID=true && break; done +[[ "$VALID" == "false" ]] && die "Invalid --job-type '${JOB_TYPE}'. Valid: ${VALID_TYPES[*]}" + +# ── build gcloud command ─────────────────────────────────────────────────────── +CMD=( + gcloud dataproc jobs submit "$JOB_TYPE" + --project "$GCP_PROJECT" + --region "$GCP_REGION" + --cluster "$CLUSTER_NAME" + "$WAIT_MODE" +) + +[[ -n "$APP" ]] && CMD+=("$APP") +[[ -n "$MAIN_CLASS" ]] && CMD+=(--class "$MAIN_CLASS") +[[ -n "$EXTRA_JARS" ]] && CMD+=(--jars "$EXTRA_JARS") +[[ -n "$EXTRA_FILES" ]] && CMD+=(--files "$EXTRA_FILES") +[[ -n "$PY_FILES" ]] && CMD+=(--py-files "$PY_FILES") +[[ -n "$ARCHIVES" ]] && CMD+=(--archives "$ARCHIVES") + +# ── collect all Spark/job properties into one --properties flag ──────────────── +ALL_PROPS=() +if [[ -n "$EXECUTOR_MEMORY" || -n "$NUM_EXECUTORS" ]]; then + [[ -n "$EXECUTOR_MEMORY" ]] && ALL_PROPS+=("spark:spark.executor.memory=${EXECUTOR_MEMORY}") + [[ -n "$NUM_EXECUTORS" ]] && ALL_PROPS+=("spark:spark.executor.instances=${NUM_EXECUTORS}") +fi +for prop in "${PROPERTIES[@]+"${PROPERTIES[@]}"}"; do + ALL_PROPS+=("$prop") +done +if [[ ${#ALL_PROPS[@]} -gt 0 ]]; then + PROPS_STR=$(IFS=,; echo "${ALL_PROPS[*]}") + CMD+=(--properties "$PROPS_STR") +fi + +if [[ ${#LABELS[@]} -gt 0 ]]; then + LABEL_STR=$(IFS=,; echo "${LABELS[*]}") + CMD+=(--labels "$LABEL_STR") +fi + +if [[ ${#JOB_ARGS[@]} -gt 0 ]]; then + CMD+=(-- "${JOB_ARGS[@]}") +fi + +# ── print and run ────────────────────────────────────────────────────────────── +echo "Submitting ${JOB_TYPE} job to Dataproc cluster '${CLUSTER_NAME}' …" +echo " Project: ${GCP_PROJECT}" +echo " Region: ${GCP_REGION}" +[[ -n "$APP" ]] && echo " App: ${APP}" +[[ -n "$MAIN_CLASS" ]] && echo " Class: ${MAIN_CLASS}" +echo "" +echo "Running: ${CMD[*]}" +echo "" +exec "${CMD[@]}" diff --git a/spark-standalone/README.md b/spark-standalone/README.md new file mode 100644 index 0000000..9a19351 --- /dev/null +++ b/spark-standalone/README.md @@ -0,0 +1,109 @@ +# Spark Standalone Execution Scripts + +Shell scripts to manage a local/single-node **Spark Standalone** cluster and submit Spark applications. + +## Prerequisites + +| Requirement | Notes | +|-------------|-------| +| Apache Spark | Set `SPARK_HOME` to the Spark installation directory | +| Java 8 / 11 / 17 | Required by Spark | + +Set `SPARK_HOME` before running any script: + +```bash +export SPARK_HOME=/opt/spark # or wherever Spark is installed +``` + +--- + +## Scripts + +### `start_cluster.sh` – Start master + workers + +Starts a Spark master process and one or more local worker processes. + +``` +Usage: ./start_cluster.sh [OPTIONS] + +Options: + -m, --master-host Hostname/IP for the master (default: localhost) + -M, --master-port Master service port (default: 7077) + -u, --master-ui-port Master Web UI port (default: 8080) + -w, --workers Number of local workers (default: 1) + -c, --worker-cores CPU cores per worker (default: all available) + -r, --worker-memory Memory per worker, e.g. 4g + -s, --spark-home Override SPARK_HOME + -h, --help Show help +``` + +**Examples:** + +```bash +# Start 1 master + 2 workers (4 cores, 8 GB each) +./start_cluster.sh --workers 2 --worker-cores 4 --worker-memory 8g + +# Bind master to a specific IP +./start_cluster.sh --master-host 192.168.1.10 +``` + +--- + +### `stop_cluster.sh` – Stop all workers and master + +``` +Usage: ./stop_cluster.sh [OPTIONS] + +Options: + -s, --spark-home Override SPARK_HOME + -h, --help Show help +``` + +**Example:** + +```bash +./stop_cluster.sh +``` + +--- + +### `submit_job.sh` – Submit a Spark application + +Wraps `spark-submit` with sensible defaults for standalone mode. + +``` +Usage: ./submit_job.sh [OPTIONS] -- [APP_ARGS] + +Options: + -m, --master Spark master URL (default: spark://localhost:7077) + -a, --app JAR or Python file (required) + -c, --class Main class (required for JAR jobs) + -n, --name Application name + -e, --executor-cores Cores per executor (default: 1) + -r, --executor-memory Memory per executor (default: 1g) + -x, --num-executors Number of executors (default: 2) + -d, --deploy-mode client | cluster (default: client) + -s, --spark-home Override SPARK_HOME + --conf Additional Spark conf (repeatable) + -h, --help Show help +``` + +**Examples:** + +```bash +# Submit a Python job +./submit_job.sh --app /path/to/etl.py + +# Submit a JAR job with application arguments +./submit_job.sh \ + --app /path/to/myapp.jar \ + --class com.example.Main \ + --executor-memory 4g \ + --num-executors 4 \ + -- --date 2024-01-01 --env prod + +# Extra Spark configuration +./submit_job.sh --app etl.py \ + --conf spark.sql.shuffle.partitions=200 \ + --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" +``` diff --git a/spark-standalone/start_cluster.sh b/spark-standalone/start_cluster.sh new file mode 100755 index 0000000..320e54a --- /dev/null +++ b/spark-standalone/start_cluster.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# start_cluster.sh – Start a Spark Standalone cluster (master + one or more workers). +# +# Usage: +# ./start_cluster.sh [OPTIONS] +# +# Options: +# -m, --master-host Hostname/IP to bind the master to (default: localhost) +# -M, --master-port Port for the Spark master service (default: 7077) +# -u, --master-ui-port Port for the master Web UI (default: 8080) +# -w, --workers Number of worker processes to start locally (default: 1) +# -c, --worker-cores CPU cores per worker (default: all available) +# -r, --worker-memory Memory per worker, e.g. 4g (default: system decides) +# -s, --spark-home SPARK_HOME directory (default: $SPARK_HOME env var) +# -h, --help Show this help message +# +# Examples: +# # Start master + 2 workers with 4 cores and 8 GB each +# SPARK_HOME=/opt/spark ./start_cluster.sh --workers 2 --worker-cores 4 --worker-memory 8g +# +# # Start master on a specific host +# ./start_cluster.sh --master-host 192.168.1.10 --workers 1 + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +MASTER_HOST="localhost" +MASTER_PORT=7077 +MASTER_UI_PORT=8080 +NUM_WORKERS=1 +WORKER_CORES="" +WORKER_MEMORY="" + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -m|--master-host) MASTER_HOST="$2"; shift 2 ;; + -M|--master-port) MASTER_PORT="$2"; shift 2 ;; + -u|--master-ui-port) MASTER_UI_PORT="$2"; shift 2 ;; + -w|--workers) NUM_WORKERS="$2"; shift 2 ;; + -c|--worker-cores) WORKER_CORES="$2"; shift 2 ;; + -r|--worker-memory) WORKER_MEMORY="$2"; shift 2 ;; + -s|--spark-home) SPARK_HOME="$2"; shift 2 ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── resolve SPARK_HOME ───────────────────────────────────────────────────────── +: "${SPARK_HOME:?SPARK_HOME is not set. Use --spark-home or export SPARK_HOME.}" +[[ -d "$SPARK_HOME" ]] || die "SPARK_HOME does not exist: $SPARK_HOME" + +SBIN="${SPARK_HOME}/sbin" +[[ -x "${SBIN}/start-master.sh" ]] || die "Spark sbin scripts not found in ${SBIN}" + +# ── start master ─────────────────────────────────────────────────────────────── +echo "Starting Spark master on ${MASTER_HOST}:${MASTER_PORT} (UI: ${MASTER_UI_PORT}) …" +SPARK_MASTER_HOST="$MASTER_HOST" \ +SPARK_MASTER_PORT="$MASTER_PORT" \ +SPARK_MASTER_WEBUI_PORT="$MASTER_UI_PORT" \ +"${SBIN}/start-master.sh" + +MASTER_URL="spark://${MASTER_HOST}:${MASTER_PORT}" +echo "Master URL: ${MASTER_URL}" +echo "Master UI: http://${MASTER_HOST}:${MASTER_UI_PORT}" + +# ── start workers ────────────────────────────────────────────────────────────── +WORKER_OPTS=() +[[ -n "$WORKER_CORES" ]] && WORKER_OPTS+=(-c "$WORKER_CORES") +[[ -n "$WORKER_MEMORY" ]] && WORKER_OPTS+=(-m "$WORKER_MEMORY") + +echo "Starting ${NUM_WORKERS} worker(s) …" +for i in $(seq 1 "$NUM_WORKERS"); do + echo " Starting worker ${i}/${NUM_WORKERS} …" + SPARK_WORKER_WEBUI_PORT=$(( 8081 + i - 1 )) \ + "${SBIN}/start-worker.sh" "${WORKER_OPTS[@]+"${WORKER_OPTS[@]}"}" "$MASTER_URL" +done + +echo "" +echo "Spark standalone cluster is running." +echo " Master: ${MASTER_URL}" +echo " Workers: ${NUM_WORKERS}" +echo "" +echo "To stop the cluster, run: ./stop_cluster.sh --spark-home ${SPARK_HOME}" diff --git a/spark-standalone/stop_cluster.sh b/spark-standalone/stop_cluster.sh new file mode 100755 index 0000000..0a95aee --- /dev/null +++ b/spark-standalone/stop_cluster.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# stop_cluster.sh – Stop a running Spark Standalone cluster (workers + master). +# +# Usage: +# ./stop_cluster.sh [OPTIONS] +# +# Options: +# -s, --spark-home SPARK_HOME directory (default: $SPARK_HOME env var) +# -h, --help Show this help message +# +# Examples: +# SPARK_HOME=/opt/spark ./stop_cluster.sh +# ./stop_cluster.sh --spark-home /opt/spark + +set -euo pipefail + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -s|--spark-home) SPARK_HOME="$2"; shift 2 ;; + -h|--help) usage ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── resolve SPARK_HOME ───────────────────────────────────────────────────────── +: "${SPARK_HOME:?SPARK_HOME is not set. Use --spark-home or export SPARK_HOME.}" +[[ -d "$SPARK_HOME" ]] || die "SPARK_HOME does not exist: $SPARK_HOME" + +SBIN="${SPARK_HOME}/sbin" + +# ── stop workers first, then master ──────────────────────────────────────────── +echo "Stopping all Spark workers …" +if [[ -x "${SBIN}/stop-worker.sh" ]]; then + "${SBIN}/stop-worker.sh" || true +else + echo " stop-worker.sh not found, skipping." +fi + +echo "Stopping Spark master …" +if [[ -x "${SBIN}/stop-master.sh" ]]; then + "${SBIN}/stop-master.sh" || true +else + echo " stop-master.sh not found, skipping." +fi + +echo "Spark standalone cluster stopped." diff --git a/spark-standalone/submit_job.sh b/spark-standalone/submit_job.sh new file mode 100755 index 0000000..ebeca9e --- /dev/null +++ b/spark-standalone/submit_job.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +# submit_job.sh – Submit a Spark application to a Standalone cluster via spark-submit. +# +# Usage: +# ./submit_job.sh [OPTIONS] -- [EXTRA_SPARK_ARGS] +# +# Options: +# -m, --master Spark master URL (default: spark://localhost:7077) +# -a, --app Path to application JAR / Python file (required) +# -c, --class Main class (required for JAR jobs) +# -n, --name Application name (default: basename of app file) +# -e, --executor-cores Cores per executor (default: 1) +# -r, --executor-memory Memory per executor, e.g. 2g (default: 1g) +# -x, --num-executors Number of executors (default: 2) +# -d, --deploy-mode client | cluster (default: client) +# -s, --spark-home SPARK_HOME directory (default: $SPARK_HOME) +# --conf Additional Spark configuration (repeatable) +# -h, --help Show this help message +# +# Everything after -- is passed verbatim to spark-submit (e.g. application args). +# +# Examples: +# # Submit a Python app +# ./submit_job.sh --app /path/to/my_etl.py --master spark://localhost:7077 +# +# # Submit a JAR with extra app arguments +# ./submit_job.sh \ +# --app /path/to/myapp.jar \ +# --class com.example.Main \ +# --executor-memory 4g \ +# --num-executors 4 \ +# -- --date 2024-01-01 --env prod +# +# # Pass extra Spark confs +# ./submit_job.sh --app myapp.py \ +# --conf spark.sql.shuffle.partitions=200 \ +# --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" + +set -euo pipefail + +# ── defaults ─────────────────────────────────────────────────────────────────── +MASTER="spark://localhost:7077" +APP="" +MAIN_CLASS="" +APP_NAME="" +EXECUTOR_CORES=1 +EXECUTOR_MEMORY="1g" +NUM_EXECUTORS=2 +DEPLOY_MODE="client" +EXTRA_CONFS=() +APP_ARGS=() + +# ── helpers ──────────────────────────────────────────────────────────────────── +usage() { + grep '^#' "$0" | sed 's/^# \{0,1\}//' + exit 0 +} + +die() { echo "ERROR: $*" >&2; exit 1; } + +require_cmd() { command -v "$1" >/dev/null 2>&1 || die "'$1' is required but not installed."; } + +# ── argument parsing ─────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + -m|--master) MASTER="$2"; shift 2 ;; + -a|--app) APP="$2"; shift 2 ;; + -c|--class) MAIN_CLASS="$2"; shift 2 ;; + -n|--name) APP_NAME="$2"; shift 2 ;; + -e|--executor-cores) EXECUTOR_CORES="$2"; shift 2 ;; + -r|--executor-memory) EXECUTOR_MEMORY="$2"; shift 2 ;; + -x|--num-executors) NUM_EXECUTORS="$2"; shift 2 ;; + -d|--deploy-mode) DEPLOY_MODE="$2"; shift 2 ;; + -s|--spark-home) SPARK_HOME="$2"; shift 2 ;; + --conf) EXTRA_CONFS+=("$2"); shift 2 ;; + -h|--help) usage ;; + --) shift; APP_ARGS=("$@"); break ;; + *) die "Unknown option: $1" ;; + esac +done + +# ── resolve SPARK_HOME ───────────────────────────────────────────────────────── +: "${SPARK_HOME:?SPARK_HOME is not set. Use --spark-home or export SPARK_HOME.}" +[[ -d "$SPARK_HOME" ]] || die "SPARK_HOME does not exist: $SPARK_HOME" + +SPARK_SUBMIT="${SPARK_HOME}/bin/spark-submit" +[[ -x "$SPARK_SUBMIT" ]] || die "spark-submit not found at: ${SPARK_SUBMIT}" + +# ── validation ───────────────────────────────────────────────────────────────── +[[ -z "$APP" ]] && die "--app is required." +[[ "$DEPLOY_MODE" == "client" || "$DEPLOY_MODE" == "cluster" ]] \ + || die "--deploy-mode must be 'client' or 'cluster'." + +[[ -z "$APP_NAME" ]] && APP_NAME="$(basename "$APP" | sed 's/\.[^.]*$//')" + +# ── build spark-submit command ───────────────────────────────────────────────── +CMD=( + "$SPARK_SUBMIT" + --master "$MASTER" + --deploy-mode "$DEPLOY_MODE" + --name "$APP_NAME" + --executor-cores "$EXECUTOR_CORES" + --executor-memory "$EXECUTOR_MEMORY" + --num-executors "$NUM_EXECUTORS" +) + +[[ -n "$MAIN_CLASS" ]] && CMD+=(--class "$MAIN_CLASS") + +for conf in "${EXTRA_CONFS[@]+"${EXTRA_CONFS[@]}"}"; do + CMD+=(--conf "$conf") +done + +CMD+=("$APP") +CMD+=("${APP_ARGS[@]+"${APP_ARGS[@]}"}") + +# ── print and run ────────────────────────────────────────────────────────────── +echo "Submitting Spark job to ${MASTER} …" +echo " App: ${APP}" +[[ -n "$MAIN_CLASS" ]] && echo " Class: ${MAIN_CLASS}" +echo " Name: ${APP_NAME}" +echo " Mode: ${DEPLOY_MODE}" +echo " Executors: ${NUM_EXECUTORS} × ${EXECUTOR_CORES} cores / ${EXECUTOR_MEMORY}" +echo "" +echo "Running: ${CMD[*]}" +echo "" +exec "${CMD[@]}"