diff --git a/.scripts/bk b/.scripts/bk
index 8096ccd..cea70b9 100755
--- a/.scripts/bk
+++ b/.scripts/bk
@@ -65,13 +65,10 @@ export BK_REPO_URL="https://github.com/${BK_REPO}.git"
 export BK_TUTORIAL="${BK_TUTORIAL:-docs/TUTORIAL.md}" # defaults to .TUTORIAL.md; can be overwritten
 export BK_BRANCH="${BK_BRANCH:-main}" # defaults to main; can be overwritten
 export BK_DIR=~/${BK_GITHUB_REPOSITORY}
-export BK_INIT_SCRIPT=~/${BK_GITHUB_REPOSITORY}/bk
 export BK_INITIALIZED=1
 
 cd ~/
 
-pip install --quiet jinja2 nbformat nbconvert
-
 if ! command -v git &> /dev/null; then
   sudo apt update
   sudo apt install -y git
@@ -87,25 +84,37 @@ fi
 
 cd $BK_GITHUB_REPOSITORY
 
 NEW_PATH=~/${BK_GITHUB_REPOSITORY}/.scripts
+PATH_EXPORT_LINE="export PATH=\${HOME}/${BK_GITHUB_REPOSITORY}/.scripts:\$PATH"
 
-# Check if the new path is already in the PATH
+# 1. Add to current session's PATH if missing
 if [[ ":$PATH:" != *":$NEW_PATH:"* ]]; then
-echo -e "${MAGENTA}Adding $NEW_PATH to your PATH${NC}"
+  echo -e "${MAGENTA}Adding $NEW_PATH to your current session's PATH${NC}"
   export PATH=${NEW_PATH}:$PATH
 else
-  echo -e "${GREEN}Your PATH already contains $NEW_PATH. Not adding it again.${NC}"
+  echo -e "${GREEN}Your current session's PATH already contains $NEW_PATH. Not adding it again.${NC}"
+fi
+
+# 2. Persist the PATH setting in ~/.bashrc if not already there
+if ! grep -qF "$PATH_EXPORT_LINE" ~/.bashrc ; then
+  echo -e "${MAGENTA}Adding $NEW_PATH to ~/.bashrc for future sessions.${NC}"
+  # Use '>>' to append the line to the file
+  echo "$PATH_EXPORT_LINE" >> ~/.bashrc
+else
+  echo -e "${GREEN}The permanent PATH export for $NEW_PATH is already in ~/.bashrc. Skipping.${NC}"
 fi
+
 unset NEW_PATH
+unset PATH_EXPORT_LINE
 
-echo -e "Sourcing $(readlink -f vars.sh)"
-source vars.sh
+echo -e "Sourcing $(readlink -f $BK_DIR/vars.sh)"
+source $BK_DIR/vars.sh
 
-if [ -f vars.local.sh ]; then
-  echo -e "Sourcing $(readlink -f vars.local.sh)"
-  source vars.local.sh
+if [ -f $BK_DIR/vars.local.sh ]; then
+  echo -e "Sourcing $(readlink -f $BK_DIR/vars.local.sh)"
+  source $BK_DIR/vars.local.sh
 fi
 
-echo -e "Variables from vars.sh: PROJECT_ID=${YELLOW}$PROJECT_ID${NC} GCP_USERNAME=${YELLOW}$GCP_USERNAME${NC} REGION=${YELLOW}$REGION${NC}"
+echo -e "Variables from $BK_DIR/vars.sh: PROJECT_ID=${YELLOW}$PROJECT_ID${NC} GCP_USERNAME=${YELLOW}$GCP_USERNAME${NC} REGION=${YELLOW}$REGION${NC}"
 
 if [ -z $PROJECT_ID ]; then
@@ -138,24 +147,6 @@ else
     echo "$line" >> ~/.bashrc
 fi
 
-## Set or update $BK_INIT_SCRIPT in ~/.bashrc
-line="export BK_INIT_SCRIPT=~/${BK_GITHUB_REPOSITORY}/.scripts/bk"
-if grep -q '^export BK_INIT_SCRIPT=' ~/.bashrc; then
-    # If the line exists but differs, update it
-    if ! grep -Fxq "$line" ~/.bashrc; then
-        sed -i "s|^export BK_INIT_SCRIPT=.*|$line|" ~/.bashrc
-        echo "Updated the existing BK_INIT_SCRIPT line in ~/.bashrc."
-    fi
-else
-    echo "$line" >> ~/.bashrc
-fi
-
-## Load $BK_INIT_SCRIPT in ~/.bashrc
-line='if [ -f ${BK_INIT_SCRIPT} ]; then source ${BK_INIT_SCRIPT}; fi'
-grep -qxF "$line" ~/.bashrc || echo "$line" >> ~/.bashrc
-
-unset line
-
 echo
 echo -e " __ --------------------------------------------------------"
 echo -e " _(\ |${RED}@@${NC}| | |"
@@ -171,4 +162,4 @@ echo
 if [ "$(basename ${BASH_SOURCE[0]})" != "bk" ]; then
   # This script is run the first time from GitHub
   bk-start
-fi
\ No newline at end of file
+fi
diff --git a/.scripts/bk-bootstrap b/.scripts/bk-bootstrap
index 14b0aae..90c9dde 100755
--- a/.scripts/bk-bootstrap
+++ b/.scripts/bk-bootstrap
@@ -58,3 +58,19 @@ for role in "${service_account_roles[@]}"; do
   gcloud projects add-iam-policy-binding "$PROJECT_ID" \
     --member="serviceAccount:$COMPUTE_SERVICE_ACCOUNT" --role="$role" >>/dev/null
 done
+
+VERTEX_AI_CC_SERVICE_ACCOUNT="service-$PROJECT_NUMBER@gcp-sa-aiplatform-cc.iam.gserviceaccount.com"
+
+# Array of roles to grant to the Vertex AI Custom Code Service Agent
+declare -a vertex_cc_service_agent_roles=(
+  "roles/artifactregistry.reader"           # AI Platform Artifact Registry Reader
+  "roles/artifactregistry.serviceAgent"     # Custom Artifact Registry Service Agent
+  "roles/aiplatform.customCodeServiceAgent" # Vertex AI Custom Code Service Agent
+)
+
+# Assign roles to the Vertex AI Custom Code Service Account
+#for role in "${vertex_cc_service_agent_roles[@]}"; do
+#  echo "Assigning role $role to $VERTEX_AI_CC_SERVICE_ACCOUNT in project $PROJECT_ID..."
+#  gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+#    --member="serviceAccount:$VERTEX_AI_CC_SERVICE_ACCOUNT" --role="$role" >>/dev/null
+#done
diff --git a/.scripts/bk-render-jinja2 b/.scripts/bk-render-jinja2
index 197d86a..2f223c2 100755
--- a/.scripts/bk-render-jinja2
+++ b/.scripts/bk-render-jinja2
@@ -6,10 +6,8 @@ import json
 import sys
 import os
 import re
-import nbformat
 import base64
 from functools import partial
-from nbconvert import HTMLExporter, MarkdownExporter
 import jinja2
 
 
@@ -42,6 +40,9 @@ def apply_to_content(data, func):
 
 
 def render_jupyter(path):
+    import nbformat
+    from nbconvert import HTMLExporter, MarkdownExporter
+
     with open(path) as f:
         nb = nbformat.read(f, as_version=4)
     exporter = HTMLExporter()
diff --git a/.scripts/bk-start b/.scripts/bk-start
index afc1bda..047e828 100755
--- a/.scripts/bk-start
+++ b/.scripts/bk-start
@@ -1,5 +1,13 @@
 #!/bin/sh
 cd $BK_DIR
+
+if [ -z "$BK_INITIALIZED" ]; then
+  echo "Bootkon has not been initialized."
+  echo "Please execute: "
+  echo " . bk (including the dot)"
+  exit 1
+fi
+
 bk-tutorial $BK_TUTORIAL
 cloudshell open-workspace .
\ No newline at end of file
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
index a59447d..57db1db 100644
--- a/docs/TUTORIAL.md
+++ b/docs/TUTORIAL.md
@@ -38,19 +38,21 @@ and set `GCP_USERNAME`, `PROJECT_ID` according to the information you received.
 
 ❗ Please do not include any whitespaces when setting these variablers.
 
-Please reload bootkon and make sure there are no errors printed:
+Please initialize bootkon. The next command will set environment variables in your current terminal.
 
 ```bash
 . bk
 ```
-
-And restart the tutorial using the next command. You can also use the next command to continue bootkon in case you accidentally close the tutorial or the editor:
+Reload the tutorial window on the right-hand side of your screen.
 
 ```bash
 bk-start
 ```
 
+In case you accidentally close the tutorial or the editor, you can execute `bk-start` to start it again. Please make sure that you execute `. bk` in every terminal
+you open so that the environment variables are set.
+
 Now, your
 
 * `PROJECT_ID` is `{% if PROJECT_ID == "" %}None{% else %}{{ PROJECT_ID }}{% endif %}`
@@ -85,4 +87,4 @@ The authors of Data & AI Bootkon are:
 Data & AI Bootkon received contributions from many people, including:
 - [Christine Schulze](https://www.linkedin.com/in/christine-schulze-33822765/)
 - [Daniel Quinlan](https://www.linkedin.com/in/%F0%9F%8C%8Ddaniel-quinlan-51126016/)
-- [Dinesh Sandra](https://www.linkedin.com/in/sandradinesh/)
\ No newline at end of file
+- [Dinesh Sandra](https://www.linkedin.com/in/sandradinesh/)
diff --git a/docs/labs/2_data_ingestion.md b/docs/labs/2_data_ingestion.md
index c187c6f..214903d 100644
--- a/docs/labs/2_data_ingestion.md
+++ b/docs/labs/2_data_ingestion.md
@@ -43,11 +43,12 @@ echo $CONN_SERVICE_ACCOUNT
 ```
 
 Let's double check the service account.
- 
+
 1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery).
-2. Expand {{ PROJECT_ID }}
-3. Expand External connections
-4. Click ``us.fraud-transactions-conn``.
+2. Click Explorer
+3. Expand {{ PROJECT_ID }}
+4. Click Connections
+5. Click fraud-transactions-conn
 
 Is the service account equivalent to the one you got from the command line?
@@ -62,10 +63,11 @@ gcloud storage buckets add-iam-policy-binding gs://{{ PROJECT_ID }}-bucket \
 
 Let's create a data set that contains the table and the external connection to Cloud Storage.
 1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery)
-2. Click the three vertical dots ⋮ next to `{{ PROJECT_ID }}` in the navigation menu
-3. Click Create dataset
-4. Enter `ml_datasets` (plural) in the ID field. Region should be multi-region US.
-5. Click Create dataset
+2. Choose Explorer
+3. Hover your mouse over {{ PROJECT_ID }}
+4. Click the three vertical dots (⋮) and go to `Create dataset`
+5. Enter `ml_datasets` (plural) in the ID field. Region should be multi-region US.
+6. Click `Create dataset`
 
 Alternatively, you can create the data set on the command line:
 ```bash
@@ -73,10 +75,11 @@ bq --location=us mk -d ml_datasets
 ```
 
 Next, we connect the data in Cloud Storage to BigQuery:
-1. Click + Add data
-2. Click Google Cloud Storage
-3. Select `Load to BigQuery`
-4. Enter the following details:
+1. Choose Explorer
+2. Click + Add data
+3. Click Google Cloud Storage
+4. Select `Load to BigQuery`
+5. Enter the following details:
   - Create table from: `Google Cloud Storage`
   - Select file: `{{ PROJECT_ID }}-bucket/data/parquet/ulb_fraud_detection/*`
   - File format: `Parquet`
@@ -87,7 +90,7 @@ Next, we connect the data in Cloud Storage to BigQuery:
   - Check *Create a BigLake table using a Cloud Resource connection*
   - Connection ID: Select `us.fraud-transactions-conn`
   - Schema: `Auto detect`
-5. Click on Create table
+6. Click on Create table
 
 Alternatively, you can also use the command line to create the table:
 
@@ -99,17 +102,19 @@ bq mk --table \
 Let's have a look at the data set:
 
 1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery)
-2. Expand {{ PROJECT_ID }}
-3. Expand ml_datasets
-4. Click ``ulb_fraud_detection_biglake``
-5. Click DETAILS
+2. Choose Explorer
+3. Expand {{ PROJECT_ID }}
+4. Click Datasets
+5. Click ml_datasets
+6. Click ulb_fraud_detection_biglake
+7. Click Details
 
 Have a look at the external data configuration. You can see the Cloud Storage bucket (`gs://...`) your data lives in.
 
 Let's query it:
 
-1. Click QUERY
+1. Click Query
 2. Insert the following SQL query.
 
 ```sql
diff --git a/docs/labs/3_dataform.md b/docs/labs/3_dataform.md
index fb32960..1b3e440 100644
--- a/docs/labs/3_dataform.md
+++ b/docs/labs/3_dataform.md
@@ -209,11 +209,11 @@
 
-1. Go to [Dataform](https://console.cloud.google.com/bigquery/dataform) \> ``{{ PROJECT_ID }}`` \> External connections \> `fraud-transactions-conn`
+1. You can find the service account ID under [BigQuery Studio](https://console.cloud.google.com/bigquery) \> Explorer \> ``{{ PROJECT_ID }}`` \> Connections \> `fraud-transactions-conn`
 
 serviceaccountconnection
 
-2. Take note of the service account and grant it the `Vertex AI User` role.
+2. Take note of the service account and grant it the `Vertex AI User` role in [IAM](https://console.cloud.google.com/iam-admin).
 
 vertexairole
 
 3. Back in your [Dataform](https://console.cloud.google.com/bigquery/dataform) workspace, click Start execution from the top menu, then Execute Actions
diff --git a/docs/labs/4_ml.md b/docs/labs/4_ml.md
index 5791e8c..de756e5 100644
--- a/docs/labs/4_ml.md
+++ b/docs/labs/4_ml.md
@@ -80,7 +80,7 @@ Here you can can see that a model in the Vertex AI Model Registry is made up fro
 
 The endpoint is created in a parallel branch in the pipeline you just ran. You can deploy models to an endpoint through the model registry.
 
-1. Click Online Prediction in the navigation menu
+1. Click Endpoints in the navigation menu
 2. Click bootkon-endpoint
 
 You can see that the endpoint has one model deployed currently, and all the traffic is routed to it (traffic split is 100%). When scrolling down, you get live graphs as soon as predictions are coming in.
@@ -91,8 +91,8 @@ You can also train and deploy models on Vertex in the UI only. Let's have a more
 
 Let's have a look at the Pipeline as well.
 
-1. Click Pipelines in the navigation menu
-2. Click bootkon-pipeline-...
+1. Click Pipelines in the navigation menu
+2. Click bootkon-pipeline-...
 
 You can now see the individual steps in the pipeline. Please click through the individual steps of the pipeline and have a look at the *Pipeline run analysis* on the right hand side as you cycle pipeline steps.
diff --git a/docs/labs/5_dataplex.md b/docs/labs/5_dataplex.md
index ab2fa81..f07600c 100644
--- a/docs/labs/5_dataplex.md
+++ b/docs/labs/5_dataplex.md
@@ -148,9 +148,8 @@ You can filter the data to be scanned for profiling by using row filters and col
 Dataplex lets you specify a percentage of records from your data to sample for running a data profiling scan. Creating data profiling scans on a smaller sample of data can reduce the execution time and cost of querying the entire dataset.
 
 Let's get started:
-
-1. Go to the Profile section in Dataplex.
-2. Click + CREATE DATA PROFILE SCAN
+1. Go to the Data profiling & quality section in Dataplex.
+2. Click Create data profile scan
 3. Set Display Name to `bootkon-profile-fraud-prediction` for example
 4. Optionally add a description. For example, "data profile scans for fraud detection predictions"
 5. Leave the “Browse within Dataplex Lakes” option turned off
@@ -223,9 +222,9 @@ Creating and using a data quality scan consists of the following steps:
 
 **Lab Instructions**
 
-1. Go to the [Data Quality](https://console.cloud.google.com/dataplex/govern/quality) section in the left hand menu of Dataplex.
+1. Go to the Data profiling & quality section in the left hand menu of Dataplex.
 
-2. Click on + CREATE DATA QUALITY SCAN
+2. Click on Create data quality scan
 
 3. Display Name: `bootkon-dquality-fraud-prediction` for example
 4. Optionally add a description. For example, "data quality scans for fraud detection predictions"
 5. Leave the "Browse within Dataplex Lakes" option turned off
diff --git a/src/ml/pipeline.py b/src/ml/pipeline.py
index 6460f22..ccbf156 100644
--- a/src/ml/pipeline.py
+++ b/src/ml/pipeline.py
@@ -77,7 +77,7 @@ def pipeline(
         display_name="bootkon-endpoint",
     )
 
-    ModelDeployOp(
+    model_deploy_op = ModelDeployOp(
         endpoint=endpoint_create_op.outputs["endpoint"],
         model=model_upload_op.outputs["model"],
         deployed_model_display_name="bootkon-endpoint",
@@ -85,6 +85,7 @@
         dedicated_resources_min_replica_count=1,
         dedicated_resources_max_replica_count=1
     )
+    model_deploy_op.after(endpoint_create_op)
 
 compiler.Compiler().compile(