Enh create registry (#6)

brightsparc · web-flow · commit 2efd23bdb66e · 2021-04-20T22:27:46.000+10:00
* Create model package groups in registry in CodeBuild, and remove from Notebook.  Update API to ensure deployment stage matches.

* Update to pass sagemaker tags for testing register

* Updates to include additional permission required to deploy CDK resources

* Updates to add experiments to project, and include tuning jobs

* Update to add boto retry

* Minor tweaks to README
diff --git a/README.md b/README.md
@@ -110,18 +110,53 @@ cdk bootstrap
 
 To bootstrap and deploy, you will require permissions to create AWS CloudFormation Stacks and the associated resources for your current execution role.
 
-If you have cloned this notebook into SageMaker Studio, you can find your user's role by browsing to the Studio dashboard.
+If you have cloned this notebook into SageMaker Studio, you will need to add additional permissions to the SageMaker Studio execution role.  You can find your user's role by browsing to the Studio dashboard.
 
 ![\[AB Testing Pipeline Execution Role\]](docs/ab-testing-pipeline-execution-role.png)
 
 Browse to the [IAM](https://console.aws.amazon.com/iam) section in the console, and find this role.  Then attach the following managed policies.
 
-* `AWSCloudFormationFullAccess`
 * `AmazonAPIGatewayAdministrator`
+* `AmazonDynamoDBFullAccess`
+* `AmazonKinesisFirehoseFullAccess`
+* `CloudWatchEventsFullAccess`
+* `AWSCloudFormationFullAccess`
 * `AWSLambda_FullAccess`
-* `AmazonKinesisFullAccess`
 * `AWSServiceCatalogAdminFullAccess`
 
+Then, click the **Add inline policy** link, switch to the **JSON** tab, and paste the following inline policy:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "iam:AttachRolePolicy",
+                "iam:CreateRole",
+                "iam:GetRole",
+                "iam:PutRolePolicy",
+                "iam:PassRole",
+                "iam:DetachRolePolicy",
+                "iam:DeleteRolePolicy",
+                "iam:DeleteRole"
+            ],
+            "Resource": "arn:aws:iam::*:role/ab-testing-api-*"
+        },
+        {
+            "Effect": "Allow",
+            "Action": [
+                "logs:PutRetentionPolicy"
+            ],
+            "Resource": "arn:aws:logs:**:*:log-group:ab-testing-api-*"
+        }
+    ]
+}
+```
+
+Click **Review policy** and provide the name `CDK-CreateRolePolicy`, then click **Create policy**.
+
 ![\[AB Testing Pipeline Execution Role\]](docs/ab-testing-pipeline-iam-role.png)
 
 You should now be able to list the stacks by running:
@@ -146,22 +181,22 @@ Follow are a list of context values that are provided in the `cdk.json`, which c
 |---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------|
 | `api_name`                | The API Gateway Name                                                                                                                                            | "ab-testing"                       |
 | `stage_name`              | The stage namespace for resource and API Gateway path                                                                                                           | "dev"                              |
-| `endpoint_prefix`         | A prefix to filter which Amazon SageMaker endpoints the API can invoked.                                                                                        | ""                                 |
+| `endpoint_prefix`         | A prefix to filter Amazon SageMaker endpoints the API can invoke.                                                                                               | ""                                 |
 | `api_lambda_memory`       | The [lambda memory](https://docs.aws.amazon.com/lambda/latest/dg/configuration-memory.html) allocation for API endpoint.                                        | 768                                |
 | `api_lambda_timeout`      | The lambda timeout for the API endpoint.                                                                                                                        | 10                                 |
 | `metrics_lambda_memory`   | The [lambda memory](https://docs.aws.amazon.com/lambda/latest/dg/configuration-memory.html) allocated for metrics processing Lambda                             | 768                                |
 | `metrics_lambda_timeout`  | The lambda timeout for the processing lambda.                                                                                                                   | 10                                 |
 | `dynamodb_read_capacity`  | The [Read Capacity](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ReadWriteCapacityMode.html) for the DynamoDB tables             | 5                                  |
 | `dynamodb_write_capacity` | The [Write Capacity](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ReadWriteCapacityMode.html) for the DynamoDB tables            | 5                                  |
-| `delivery_sync`           | When set to `true`, metrics will be written directly to DynamoDB in real-time, instead of written to Amazon Kinesis for processing (recommend for testing only) | false                              |
-| `firehose_interval`       | The [buffering](https://docs.aws.amazon.com/firehose/latest/dev/create-configure.html) interval in seconds at which the firehose will flush events to S3.       | 60                                 |
+| `delivery_sync`           | When `true`, metrics will be written directly to DynamoDB instead of to Amazon Kinesis for processing.                                                          | false                              |
+| `firehose_interval`       | The [buffering](https://docs.aws.amazon.com/firehose/latest/dev/create-configure.html) interval in seconds at which firehose will flush events to S3.           | 60                                 |
 | `firehose_mb_size`        | The buffering size in MB before the firehose will flush its events to S3.                                                                                       | 1                                  |
 | `log_level`               | Logging level for AWS Lambda functions                                                                                                                          | "INFO"                             |
 
 Run the following command to deploy the API and testing infrastructure, optionally override context values.
 
 ```
-cdk deploy ab-testing-api
+cdk deploy ab-testing-api -c endpoint_prefix=ab-testing-pipeline
 ```
 
 This stack will ask you to confirm any changes, and output the `ApiEndpoint` which you will provide to the A/B Testing sample notebook.
@@ -320,6 +355,38 @@ With the Deployment Pipeline complete, you will be able to continue with the nex
 5. Plot the beta distributions of the course of the test.
 6. Calculate the statistical significance of the test.
 
+## Running Cost
+
+This section outlines cost considerations for running the A/B Testing Pipeline. Completing the pipeline will deploy an endpoint with 2 production variants which will cost less than $3 per day. Further cost breakdowns are below.
+
+- **CodeBuild** – Charges per minute used. First 100 minutes each month come at no charge. For information on pricing beyond the first 100 minutes, see [AWS CodeBuild Pricing](https://aws.amazon.com/codebuild/pricing/).
+- **CodeCommit** – $1/month if you didn't opt to use your own GitHub repository.
+- **CodePipeline** – CodePipeline costs $1 per active pipeline* per month. Pipelines are free for the first 30 days after creation. More can be found at [AWS CodePipeline Pricing](https://aws.amazon.com/codepipeline/pricing/).
+- **SageMaker** – Prices vary based on EC2 instance usage for the Notebook Instances, Model Hosting, Model Training and Model Monitoring; each charged per hour of use. For more information, see [Amazon SageMaker Pricing](https://aws.amazon.com/sagemaker/pricing/).
+  - The ten `ml.c5.4xlarge` *training jobs* run for approx 4 minutes at $0.81 an hour, and cost less than $1.
+  - The two `ml.t2.medium` instances for the production *hosting* endpoint cost 2 x $0.056 per hour, or $2.68 per day.
+- **S3** – Low cost, prices will vary depending on the size of the models/artifacts stored. The first 50 TB each month will cost only $0.023 per GB stored. For more information, see [Amazon S3 Pricing](https://aws.amazon.com/s3/pricing/).
+- **API Gateway** – Low cost, $1.29 for the first 300 million requests. For more information, see [Amazon API Gateway pricing](https://aws.amazon.com/api-gateway/pricing/).
+- **Lambda** – Low cost, $0.20 per 1 million requests. For more information, see [AWS Lambda Pricing](https://aws.amazon.com/lambda/pricing/).
+
+## Cleaning Up
+
+Once you have cleaned up the SageMaker Endpoints and Project as described in the [Sample Notebook](notebook/mab-reviews-helpfulness.ipynb), complete the clean up by deleting the **Service Catalog** and **API** resources with the AWS CDK:
+
+1. Delete the Service Catalog Portfolio and Project Template
+
+```
+cdk destroy ab-testing-service-catalog
+```
+
+2. Delete the API and testing infrastructure
+
+Before destroying the API stack, it is recommended that you [empty](https://docs.aws.amazon.com/AmazonS3/latest/userguide/empty-bucket.html) and [delete](https://docs.aws.amazon.com/AmazonS3/latest/userguide/delete-bucket.html) the S3 Bucket that contains the S3 logs persisted by the Kinesis Firehose.
+
+```
+cdk destroy ab-testing-api
+```
+
 ## Want to know more?
 
 The [FAQ](FAQ.md) page has some answers to questions on the design principles of this sample. 
diff --git a/deployment_pipeline/app.py b/deployment_pipeline/app.py
@@ -55,6 +55,7 @@
     "ab-testing-sagemaker",
     deployment_config=deployment_config,
     project_name=project_name,
+    project_id=project_id,
     endpoint_name=endpoint_name,
     tags=tags,
 )
diff --git a/deployment_pipeline/infra/model_registry.py b/deployment_pipeline/infra/model_registry.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 
 import boto3
+from botocore.config import Config
 from botocore.exceptions import ClientError
 
 logger = logging.getLogger(__name__)
@@ -13,7 +14,45 @@ class ModelRegistry:
     """
 
     def __init__(self):
-        self.sm_client = boto3.client("sagemaker")
+        config = Config(retries={"max_attempts": 10, "mode": "standard"})
+        self.sm_client = boto3.client("sagemaker", config=config)
+
+    def create_model_package_group(
+        self,
+        model_package_group_name: str,
+        description: str,
+        project_name: str,
+        project_id: str,
+    ):
+        """
+        Create the model package group if it doesn't exist.
+        """
+        try:
+            self.sm_client.create_model_package_group(
+                ModelPackageGroupName=model_package_group_name,
+                ModelPackageGroupDescription=description,
+                Tags=[
+                    {"Key": "sagemaker:project-name", "Value": project_name},
+                    {"Key": "sagemaker:project-id", "Value": project_id},
+                ],
+            )
+            logger.info(f"Model package group {model_package_group_name} created")
+            return True
+
+        except ClientError as e:
+            error_code = e.response["Error"]["Code"]
+            error_message = e.response["Error"]["Message"]
+            if (
+                error_code == "ValidationException"
+                and "Model Package Group already exists" in error_message
+            ):
+                logger.info(
+                    f"Model package group {model_package_group_name} already exists"
+                )
+                return False
+            else:
+                logger.error(error_message)
+                raise Exception(error_message)
 
     def get_latest_approved_packages(
         self,
diff --git a/deployment_pipeline/infra/sagemaker_stack.py b/deployment_pipeline/infra/sagemaker_stack.py
@@ -19,6 +19,7 @@ def __init__(
         construct_id: str,
         deployment_config: DeploymentConfig,
         project_name: str,
+        project_id: str,
         endpoint_name: str,
         tags: list,
         **kwargs,
@@ -28,10 +29,22 @@ def __init__(
         # Define the package group names for champion and challenger
         champion_package_group = f"{project_name}-champion"
         challenger_package_group = f"{project_name}-challenger"
+        challenger_creation_time: datetime = None
 
-        # Get the approved packages for the project
+        # Create the model package groups if they don't exist
         registry = ModelRegistry()
-        challenger_creation_time: datetime = None
+        registry.create_model_package_group(
+            champion_package_group,
+            "Champion Models for A/B Testing",
+            project_name,
+            project_id,
+        )
+        registry.create_model_package_group(
+            challenger_package_group,
+            "Challenger Models for A/B Testing",
+            project_name,
+            project_id,
+        )
 
         # If we don't have a specific champion variant defined, get the latest approved
         if deployment_config.champion_variant_config is None:
@@ -99,7 +112,7 @@ def __init__(
             f"arn:aws:iam::{self.account}:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole",
         )
 
-        # Add the challenger variant
+        # Add the champion and challenger variants
         model_configs = [
             deployment_config.champion_variant_config
         ] + deployment_config.challenger_variant_config
diff --git a/deployment_pipeline/infra/test_model_registry.py b/deployment_pipeline/infra/test_model_registry.py
@@ -17,6 +17,53 @@ def get_package(version: int, creation_time: datetime = datetime.fromtimestamp(0
     }
 
 
+@pytest.mark.skip(reason="botocore.exceptions.ParamValidationError: fails with Tags")
+def test_create_model_package_group():
+    # Create model registry
+    registry = ModelRegistry()
+
+    with Stubber(registry.sm_client) as stubber:
+        # Empty list with more
+        expected_params = {
+            "ModelPackageGroupDescription": "test package group",
+            "ModelPackageGroupName": "test-package-group",
+            "Tags": [
+                {"Key": "sagemaker:project-name", "Value": "test-project-name"},
+                {"Key": "sagemaker:project-id", "Value": "test-project-id"},
+            ],
+        }
+        expected_response = {
+            "ModelPackageGroupArn": f"arn:aws:sagemaker:REGION:ACCOUNT:model-package-group/test-package-group",
+        }
+        stubber.add_response(
+            "create_model_package_group", expected_response, expected_params
+        )
+
+        # Second time, add the client error if this exists
+        stubber.add_client_error(
+            "create_model_package_group",
+            "ValidationException",
+            "Model Package Group already exists",
+            expected_params=expected_params,
+        )
+
+        created = registry.create_model_package_group(
+            "test-package-group",
+            "test package group",
+            "test-project-name",
+            "test-project-id",
+        )
+        assert created == True
+
+        created = registry.create_model_package_group(
+            "test-package-group",
+            "test package group",
+            "test-project-name",
+            "test-project-id",
+        )
+        assert created == False
+
+
 def test_get_latest_approved_model_packages():
     # Create model registry
     registry = ModelRegistry()
diff --git a/deployment_pipeline/register.py b/deployment_pipeline/register.py
@@ -14,6 +14,7 @@
 
 # Load these from environment variables, that are passed into CodeBuild job from pipeline stack
 project_name = os.environ["SAGEMAKER_PROJECT_NAME"]
+project_id = os.environ["SAGEMAKER_PROJECT_ID"]
 stage_name = os.environ["STAGE_NAME"]
 register_lambda = os.environ["REGISTER_LAMBDA"]
 
@@ -24,20 +25,25 @@
 # Get the config and include with endpoint to register this model
 with open(f"{stage_name}-config.json", "r") as f:
     j = json.load(f)
-    event = json.dumps({
-        'source': 'aws.sagemaker',
-        'detail-type': 'SageMaker Endpoint State Change',
-        'detail': {
-            'EndpointName': endpoint_name,
-            'EndpointStatus': 'IN_SERVICE',
-            'Tags': {
-                'ab-testing:enabled': 'true',
-                'ab-testing:strategy': j.get('strategy', 'ThompsonSampling'),
-                'ab-testing:epsilon': str(j.get('epsilon', 0.1)),
-                'ab-testing:warmup': str(j.get('warmup', 0)),
-            }
+    event = json.dumps(
+        {
+            "source": "aws.sagemaker",
+            "detail-type": "SageMaker Endpoint State Change",
+            "detail": {
+                "EndpointName": endpoint_name,
+                "EndpointStatus": "IN_SERVICE",
+                "Tags": {
+                    "sagemaker:project-name": project_name,
+                    "sagemaker:project-id": project_id,
+                    "sagemaker:deployment-stage": stage_name,
+                    "ab-testing:enabled": "true",
+                    "ab-testing:strategy": j.get("strategy", "ThompsonSampling"),
+                    "ab-testing:epsilon": str(j.get("epsilon", 0.1)),
+                    "ab-testing:warmup": str(j.get("warmup", 0)),
+                },
+            },
         }
-    })
+    )
     response = lambda_client.invoke(
         FunctionName=register_lambda,
         InvocationType="RequestResponse",
@@ -47,5 +53,5 @@
     # Print the result, and if not successful raise error
     result = json.loads(response["Payload"].read())
     print(result)
-    if result["statusCode"] != 200:
+    if result["statusCode"] not in [200, 201]:
         raise Exception("Unexpected status code: {}".format(result["statusCode"]))
diff --git a/deployment_pipeline/setup.py b/deployment_pipeline/setup.py
@@ -15,7 +15,7 @@
     package_dir={"": "infra"},
     packages=setuptools.find_packages(where="infra"),
     install_requires=[
-        "boto3==1.17.33",
+        "boto3>=1.17.54",
         "aws-cdk.core==1.94.1",
         "aws-cdk.aws-iam==1.94.1",
         "aws-cdk.aws-sagemaker==1.94.1",
diff --git a/docs/ab-testing-pipeline-iam-role.png b/docs/ab-testing-pipeline-iam-role.png
diff --git a/infra/api_stack.py b/infra/api_stack.py
@@ -145,7 +145,7 @@ def __init__(
             environment={
                 "METRICS_TABLE": metrics_table.table_name,
                 "DELIVERY_STREAM_NAME": delivery_stream_name,
-                "DELIVERY_SYNC": "true" if delivery_sync else "false",
+                "STAGE_NAME": stage_name,
                 "LOG_LEVEL": log_level,
                 "ENDPOINT_PREFIX": endpoint_prefix,
             },
diff --git a/lambda/api/lambda_register.py b/lambda/api/lambda_register.py
diff --git a/notebook/mab-reviews-helpfulness.ipynb b/notebook/mab-reviews-helpfulness.ipynb

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,7 @@`
`55`	`55`	`"ab-testing-sagemaker",`
`56`	`56`	`deployment_config=deployment_config,`
`57`	`57`	`project_name=project_name,`
	`58`	`+ project_id=project_id,`
`58`	`59`	`endpoint_name=endpoint_name,`
`59`	`60`	`tags=tags,`
`60`	`61`	`)`