diff --git a/.scripts/bk b/.scripts/bk
index 8096ccd..cea70b9 100755
--- a/.scripts/bk
+++ b/.scripts/bk
@@ -65,13 +65,10 @@ export BK_REPO_URL="https://github.com/${BK_REPO}.git"
export BK_TUTORIAL="${BK_TUTORIAL:-docs/TUTORIAL.md}" # defaults to .TUTORIAL.md; can be overwritten
export BK_BRANCH="${BK_BRANCH:-main}" # defaults to main; can be overwritten
export BK_DIR=~/${BK_GITHUB_REPOSITORY}
-export BK_INIT_SCRIPT=~/${BK_GITHUB_REPOSITORY}/bk
export BK_INITIALIZED=1
cd ~/
-pip install --quiet jinja2 nbformat nbconvert
-
if ! command -v git &> /dev/null; then
sudo apt update
sudo apt install -y git
@@ -87,25 +84,37 @@ fi
cd $BK_GITHUB_REPOSITORY
NEW_PATH=~/${BK_GITHUB_REPOSITORY}/.scripts
+PATH_EXPORT_LINE="export PATH=\${HOME}/${BK_GITHUB_REPOSITORY}/.scripts:\$PATH"
-# Check if the new path is already in the PATH
+# 1. Add to current session's PATH if missing
if [[ ":$PATH:" != *":$NEW_PATH:"* ]]; then
-echo -e "${MAGENTA}Adding $NEW_PATH to your PATH${NC}"
+ echo -e "${MAGENTA}Adding $NEW_PATH to your current session's PATH${NC}"
export PATH=${NEW_PATH}:$PATH
else
- echo -e "${GREEN}Your PATH already contains $NEW_PATH. Not adding it again.${NC}"
+ echo -e "${GREEN}Your current session's PATH already contains $NEW_PATH. Not adding it again.${NC}"
+fi
+
+# 2. Persist the PATH setting in ~/.bashrc if not already there
+if ! grep -qF "$PATH_EXPORT_LINE" ~/.bashrc ; then
+ echo -e "${MAGENTA}Adding $NEW_PATH to ~/.bashrc for future sessions.${NC}"
+ # Use '>>' to append the line to the file
+ echo "$PATH_EXPORT_LINE" >> ~/.bashrc
+else
+ echo -e "${GREEN}The permanent PATH export for $NEW_PATH is already in ~/.bashrc. Skipping.${NC}"
fi
+
unset NEW_PATH
+unset PATH_EXPORT_LINE
-echo -e "Sourcing $(readlink -f vars.sh)"
-source vars.sh
+echo -e "Sourcing $(readlink -f $BK_DIR/vars.sh)"
+source $BK_DIR/vars.sh
-if [ -f vars.local.sh ]; then
- echo -e "Sourcing $(readlink -f vars.local.sh)"
- source vars.local.sh
+if [ -f $BK_DIR/vars.local.sh ]; then
+ echo -e "Sourcing $(readlink -f $BK_DIR/vars.local.sh)"
+ source $BK_DIR/vars.local.sh
fi
-echo -e "Variables from vars.sh: PROJECT_ID=${YELLOW}$PROJECT_ID${NC} GCP_USERNAME=${YELLOW}$GCP_USERNAME${NC} REGION=${YELLOW}$REGION${NC}"
+echo -e "Variables from $BK_DIR/vars.sh: PROJECT_ID=${YELLOW}$PROJECT_ID${NC} GCP_USERNAME=${YELLOW}$GCP_USERNAME${NC} REGION=${YELLOW}$REGION${NC}"
if [ -z $PROJECT_ID ]; then
@@ -138,24 +147,6 @@ else
echo "$line" >> ~/.bashrc
fi
-## Set or update $BK_INIT_SCRIPT in ~/.bashrc
-line="export BK_INIT_SCRIPT=~/${BK_GITHUB_REPOSITORY}/.scripts/bk"
-if grep -q '^export BK_INIT_SCRIPT=' ~/.bashrc; then
- # If the line exists but differs, update it
- if ! grep -Fxq "$line" ~/.bashrc; then
- sed -i "s|^export BK_INIT_SCRIPT=.*|$line|" ~/.bashrc
- echo "Updated the existing BK_INIT_SCRIPT line in ~/.bashrc."
- fi
-else
- echo "$line" >> ~/.bashrc
-fi
-
-## Load $BK_INIT_SCRIPT in ~/.bashrc
-line='if [ -f ${BK_INIT_SCRIPT} ]; then source ${BK_INIT_SCRIPT}; fi'
-grep -qxF "$line" ~/.bashrc || echo "$line" >> ~/.bashrc
-
-unset line
-
echo
echo -e " __ --------------------------------------------------------"
echo -e " _(\ |${RED}@@${NC}| | |"
@@ -171,4 +162,4 @@ echo
if [ "$(basename ${BASH_SOURCE[0]})" != "bk" ]; then
# This script is run the first time from GitHub
bk-start
-fi
\ No newline at end of file
+fi
diff --git a/.scripts/bk-bootstrap b/.scripts/bk-bootstrap
index 14b0aae..90c9dde 100755
--- a/.scripts/bk-bootstrap
+++ b/.scripts/bk-bootstrap
@@ -58,3 +58,19 @@ for role in "${service_account_roles[@]}"; do
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
--member="serviceAccount:$COMPUTE_SERVICE_ACCOUNT" --role="$role" >>/dev/null
done
+
+VERTEX_AI_CC_SERVICE_ACCOUNT="service-$PROJECT_NUMBER@gcp-sa-aiplatform-cc.iam.gserviceaccount.com"
+
+# Array of roles to grant to the Vertex AI Custom Code Service Agent
+declare -a vertex_cc_service_agent_roles=(
+ "roles/artifactregistry.reader" # AI Platform Artifact Registry Reader
+ "roles/artifactregistry.serviceAgent" # Custom Artifact Registry Service Agent
+ "roles/aiplatform.customCodeServiceAgent" # Vertex AI Custom Code Service Agent
+)
+
+# Assign roles to the Vertex AI Custom Code Service Account
+#for role in "${vertex_cc_service_agent_roles[@]}"; do
+# echo "Assigning role $role to $VERTEX_AI_CC_SERVICE_ACCOUNT in project $PROJECT_ID..."
+# gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+# --member="serviceAccount:$VERTEX_AI_CC_SERVICE_ACCOUNT" --role="$role" >>/dev/null
+#done
diff --git a/.scripts/bk-render-jinja2 b/.scripts/bk-render-jinja2
index 197d86a..2f223c2 100755
--- a/.scripts/bk-render-jinja2
+++ b/.scripts/bk-render-jinja2
@@ -6,10 +6,8 @@ import json
import sys
import os
import re
-import nbformat
import base64
from functools import partial
-from nbconvert import HTMLExporter, MarkdownExporter
import jinja2
@@ -42,6 +40,9 @@ def apply_to_content(data, func):
def render_jupyter(path):
+ import nbformat
+ from nbconvert import HTMLExporter, MarkdownExporter
+
with open(path) as f:
nb = nbformat.read(f, as_version=4)
exporter = HTMLExporter()
diff --git a/.scripts/bk-start b/.scripts/bk-start
index afc1bda..047e828 100755
--- a/.scripts/bk-start
+++ b/.scripts/bk-start
@@ -1,5 +1,13 @@
#!/bin/sh
cd $BK_DIR
+
+if [ -z "$BK_INITIALIZED" ]; then
+ echo "Bootkon has not been initialized."
+ echo "Please execute: "
+ echo " . bk (including the dot)"
+ exit 1
+fi
+
bk-tutorial $BK_TUTORIAL
cloudshell open-workspace .
\ No newline at end of file
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index a59447d..57db1db 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -38,19 +38,21 @@ and set `GCP_USERNAME`, `PROJECT_ID` according to the information you received.
❗ Please do not include any whitespaces when setting these variablers.
-Please reload bootkon and make sure there are no errors printed:
+Please initialize bootkon. The next command will set environment variables in your current terminal.
```bash
. bk
```
-
-And restart the tutorial using the next command. You can also use the next command to continue bootkon in case you accidentally close the tutorial or the editor:
+Reload the tutorial window on the right-hand side of your screen.
```bash
bk-start
```
+In case you accidentally close the tutorial or the editor, you can execute `bk-start` to start it again. Please make sure that you execute `. bk` in every terminal
+you open so that the environment variables are set.
+
Now, your
* `PROJECT_ID` is `{% if PROJECT_ID == "" %}None{% else %}{{ PROJECT_ID }}{% endif %}`
@@ -85,4 +87,4 @@ The authors of Data & AI Bootkon are:
Data & AI Bootkon received contributions from many people, including:
- [Christine Schulze](https://www.linkedin.com/in/christine-schulze-33822765/)
- [Daniel Quinlan](https://www.linkedin.com/in/%F0%9F%8C%8Ddaniel-quinlan-51126016/)
-- [Dinesh Sandra](https://www.linkedin.com/in/sandradinesh/)
\ No newline at end of file
+- [Dinesh Sandra](https://www.linkedin.com/in/sandradinesh/)
diff --git a/docs/labs/2_data_ingestion.md b/docs/labs/2_data_ingestion.md
index c187c6f..214903d 100644
--- a/docs/labs/2_data_ingestion.md
+++ b/docs/labs/2_data_ingestion.md
@@ -43,11 +43,12 @@ echo $CONN_SERVICE_ACCOUNT
```
Let's double check the service account.
-
+
1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery).
-2. Expand {{ PROJECT_ID }}
-3. Expand External connections
-4. Click ``us.fraud-transactions-conn``.
+2. Click Explorer
+3. Expand {{ PROJECT_ID }}
+4. Click Connections
+5. Click fraud-transactions-conn
Is the service account equivalent to the one you got from the command line?
@@ -62,10 +63,11 @@ gcloud storage buckets add-iam-policy-binding gs://{{ PROJECT_ID }}-bucket \
Let's create a data set that contains the table and the external connection to Cloud Storage.
1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery)
-2. Click the three vertical dots ⋮ next to `{{ PROJECT_ID }}` in the navigation menu
-3. Click Create dataset
-4. Enter `ml_datasets` (plural) in the ID field. Region should be multi-region US.
-5. Click Create dataset
+2. Choose Explorer
+3. Hover your mouse over {{ PROJECT_ID }}
+4. Click the three vertical dots (⋮) and go to `Create dataset`
+5. Enter `ml_datasets` (plural) in the ID field. Region should be multi-region US.
+6. Click `Create dataset`
Alternatively, you can create the data set on the command line:
```bash
@@ -73,10 +75,11 @@ bq --location=us mk -d ml_datasets
```
Next, we connect the data in Cloud Storage to BigQuery:
-1. Click + Add data
-2. Click Google Cloud Storage
-3. Select `Load to BigQuery`
-4. Enter the following details:
+1. Choose Explorer
+2. Click + Add data
+3. Click Google Cloud Storage
+4. Select `Load to BigQuery`
+5. Enter the following details:
- Create table from: `Google Cloud Storage`
- Select file: `{{ PROJECT_ID }}-bucket/data/parquet/ulb_fraud_detection/*`
- File format: `Parquet`
@@ -87,7 +90,7 @@ Next, we connect the data in Cloud Storage to BigQuery:
- Check *Create a BigLake table using a Cloud Resource connection*
- Connection ID: Select `us.fraud-transactions-conn`
- Schema: `Auto detect`
-5. Click on Create table
+6. Click on Create table
Alternatively, you can also use the command line to create the table:
@@ -99,17 +102,19 @@ bq mk --table \
Let's have a look at the data set:
1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery)
-2. Expand {{ PROJECT_ID }}
-3. Expand ml_datasets
-4. Click ``ulb_fraud_detection_biglake``
-5. Click DETAILS
+2. Choose Explorer
+3. Expand {{ PROJECT_ID }}
+4. Click Datasets
+5. Click ml_datasets
+6. Click ulb_fraud_detection_biglake
+7. Click Details
Have a look at the external data configuration. You can see the Cloud Storage bucket (`gs://...`) your data
lives in.
Let's query it:
-1. Click QUERY
+1. Click Query
2. Insert the following SQL query.
```sql
diff --git a/docs/labs/3_dataform.md b/docs/labs/3_dataform.md
index fb32960..1b3e440 100644
--- a/docs/labs/3_dataform.md
+++ b/docs/labs/3_dataform.md
@@ -209,11 +209,11 @@ Go to [Dataform](https://console.cloud.google.com/bigquery/dataform)\> ``{{ PROJECT_ID }}`` \> External connections \> `fraud-transactions-conn`
+1. You can find the service account ID under [BigQuery Studio](https://console.cloud.google.com/bigquery) \> Explorer \> ``{{ PROJECT_ID }}`` \> Connections \> `fraud-transactions-conn`
-2. Take note of the service account and grant it the `Vertex AI User` role.
+2. Take note of the service account and grant it the `Vertex AI User` role in [IAM](https://console.cloud.google.com/iam-admin).
3. Back in your [Dataform](https://console.cloud.google.com/bigquery/dataform) workspace, click Start execution from the top menu, then Execute Actions
diff --git a/docs/labs/4_ml.md b/docs/labs/4_ml.md
index 5791e8c..de756e5 100644
--- a/docs/labs/4_ml.md
+++ b/docs/labs/4_ml.md
@@ -80,7 +80,7 @@ Here you can can see that a model in the Vertex AI Model Registry is made up fro
The endpoint is created in a parallel branch in the pipeline you just ran. You can deploy models to an endpoint through the model registry.
-1. Click Online Prediction in the navigation menu
+1. Click Endpoints in the navigation menu
2. Click bootkon-endpoint
You can see that the endpoint has one model deployed currently, and all the traffic is routed to it (traffic split is 100%). When scrolling down, you get live graphs as soon as predictions are coming in.
@@ -91,8 +91,8 @@ You can also train and deploy models on Vertex in the UI only. Let's have a more
Let's have a look at the Pipeline as well.
-1. Click Pipelines in the navigation menu
-2. Click bootkon-pipeline-...
+1. Click Pipelines in the navigation menu
+2. Click bootkon-pipeline-...
You can now see the individual steps in the pipeline. Please click through the individual steps of the pipeline and have a look at the *Pipeline run analysis* on the right hand side as you cycle pipeline steps.
diff --git a/docs/labs/5_dataplex.md b/docs/labs/5_dataplex.md
index ab2fa81..f07600c 100644
--- a/docs/labs/5_dataplex.md
+++ b/docs/labs/5_dataplex.md
@@ -148,9 +148,8 @@ You can filter the data to be scanned for profiling by using row filters and col
Dataplex lets you specify a percentage of records from your data to sample for running a data profiling scan. Creating data profiling scans on a smaller sample of data can reduce the execution time and cost of querying the entire dataset.
Let's get started:
-
-1. Go to the Profile section in Dataplex.
-2. Click + CREATE DATA PROFILE SCAN
+1. Go to the Data profiling & quality section in Dataplex.
+2. Click Create data profile scan
3. Set Display Name to `bootkon-profile-fraud-prediction` for example
4. Optionally add a description. For example, "data profile scans for fraud detection predictions"
5. Leave the “Browse within Dataplex Lakes” option turned off
@@ -223,9 +222,9 @@ Creating and using a data quality scan consists of the following steps:
**Lab Instructions**
-1. Go to the [Data Quality](https://console.cloud.google.com/dataplex/govern/quality) section in the left hand menu of Dataplex.
+1. Go to the Data profiling & quality section in the left hand menu of Dataplex.
-2. Click on + CREATE DATA QUALITY SCAN
+2. Click on Create data quality scan
3. Display Name: `bootkon-dquality-fraud-prediction` for example
4. Optionally add a description. For example, "data quality scans for fraud detection predictions"
5. Leave the "Browse within Dataplex Lakes" option turned off
diff --git a/src/ml/pipeline.py b/src/ml/pipeline.py
index 6460f22..ccbf156 100644
--- a/src/ml/pipeline.py
+++ b/src/ml/pipeline.py
@@ -77,7 +77,7 @@ def pipeline(
display_name="bootkon-endpoint",
)
- ModelDeployOp(
+ model_deploy_op = ModelDeployOp(
endpoint=endpoint_create_op.outputs["endpoint"],
model=model_upload_op.outputs["model"],
deployed_model_display_name="bootkon-endpoint",
@@ -85,6 +85,7 @@ def pipeline(
dedicated_resources_min_replica_count=1,
dedicated_resources_max_replica_count=1
)
+ model_deploy_op.after(endpoint_create_op)
compiler.Compiler().compile(