
Commit ebcc9cf

FS creation, ingestion using mljob
1 parent 7ce0f66

5 files changed: +956 -0 lines changed

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
Feature Store Creation and Ingestion using ML Job
=====================

In this example, you use the Oracle Cloud Infrastructure (OCI) Data Science service MLJob component to create OCI Feature Store design-time constructs and then ingest feature values into the offline feature store.

The tutorial uses an electronic health data use case consisting of patient test results. The example demonstrates the creation of the feature store, entity, transformation, and feature group design-time constructs using a Python script that is provided as a job artifact. A second job artifact demonstrates the ingestion of feature values into the pre-created feature group. A minimal sketch of submitting such a script as an ML Job appears at the end of this README.

# Prerequisites

The notebook makes connections to other OCI resources using [resource principals](https://docs.oracle.com/en-us/iaas/Content/Functions/Tasks/functionsaccessingociresources.htm). If you have not configured your tenancy to use resource principals, you can do so using the instructions [here](https://docs.oracle.com/en-us/iaas/data-science/using/create-dynamic-groups.htm). Alternatively, you can use API keys; however, resource principals are the preferred method of authentication.
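
Both job scripts in this commit enable resource principals at the top of the script (they additionally pass the feature store `service_endpoint` through `client_kwargs`); a minimal form:

```python
import ads

# Authenticate to OCI using the notebook or job resource principal.
ads.set_auth(auth="resource_principal")
```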

# Instructions

1. Open a Data Science Notebook session (i.e., JupyterLab).
1. Open a terminal by clicking File -> New -> Terminal.
1. In the terminal, run the following commands:
    1. `odsc conda install -s fspyspark32_p38_cpu_v1` to install the feature store conda environment.
    1. `conda activate /home/datascience/conda/fspyspark32_p38_cpu_v1` to activate the conda environment.
1. Copy the `notebooks` folder into the notebook session.
1. Open the notebook `notebook/feature_store_using_mljob.ipynb`.
1. Change the notebook kernel to `Python [conda env:fspyspark32_p38_cpu_v1]`.
1. Read the notebook and execute each cell.
1. Once the ML job run completes successfully, validate the creation of the feature store constructs using the feature store notebook UI extension.
1. Open the notebook `notebook/feature_store_ingestion_via_mljob.ipynb`.
1. Change the notebook kernel to `Python [conda env:fspyspark32_p38_cpu_v1]`.
1. Read the notebook and execute each cell.
1. Validate that the ingestion ML job run completed successfully.
1. Validate the ingested data and other metadata using the feature store notebook UI extension.
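
For reference, here is a minimal sketch of how a script artifact like the ones in this commit could be submitted as an ML Job with the ADS SDK. The job name, shape, source file name, and the compartment/project OCIDs below are illustrative placeholders, not values from this commit:

```python
from ads.jobs import Job, DataScienceJob, ScriptRuntime

job = (
    Job(name="feature-store-creation")  # placeholder name
    .with_infrastructure(
        DataScienceJob()
        .with_compartment_id("<compartment_ocid>")  # placeholder
        .with_project_id("<project_ocid>")          # placeholder
        .with_shape_name("VM.Standard.E3.Flex")
        .with_shape_config_details(memory_in_gbs=16, ocpus=1)
    )
    .with_runtime(
        ScriptRuntime()
        .with_source("feature_store_creation.py")   # placeholder file name
        .with_service_conda("fspyspark32_p38_cpu_v1")
    )
)
job.create()
job_run = job.run()
job_run.watch()  # stream the job run logs
```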
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
from ads.feature_store.feature_group import FeatureGroup
import pandas as pd
import ads

# Authenticate via resource principals against the feature store service endpoint.
ads.set_auth(auth="resource_principal", client_kwargs={"service_endpoint": "https://fnk6p6iswuttzxwffxq6uwpj2u.apigateway.us-ashburn-1.oci.customer-oci.com/20230101"})

# Look up the pre-created feature group by its ID.
ehr_feature_group = FeatureGroup.from_id("FED8117CDF1EE54F5A742EFFA2A88433")

# Load the patient test results and materialise (ingest) them into the
# offline feature store.
patient_result_df = pd.read_csv("https://objectstorage.us-ashburn-1.oraclecloud.com/p/hh2NOgFJbVSg4amcLM3G3hkTuHyBD-8aE_iCsuZKEvIav1Wlld-3zfCawG4ycQGN/n/ociodscdev/b/oci-feature-store/o/beta/data/EHR/data-ori.csv")

if ehr_feature_group:
    ehr_feature_group.materialise(patient_result_df)
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
import os

import ads
import pandas as pd
from ads.feature_store.feature_store import FeatureStore
from ads.feature_store.feature_group import FeatureGroup
from ads.feature_store.common.enums import ExpectationType
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

print("Initiating feature store lazy entities creation")

# NAME can be set as an environment variable on the job.
NAME = os.environ.get("NAME", "Job")

# A -g command line argument can be set on the job:
# import argparse
# parser = argparse.ArgumentParser(allow_abbrev=False)
# parser.add_argument("-g", "--greeting", required=False, default="Hello")
# args, unknown = parser.parse_known_args()
#
# To debug:
# print(f'args: {args}')
# print(f'unknown: {unknown}')

# Authenticate via resource principals against the feature store service endpoint.
ads.set_auth(auth="resource_principal", client_kwargs={"service_endpoint": "https://fnk6p6iswuttzxwffxq6uwpj2u.apigateway.us-ashburn-1.oci.customer-oci.com/20230101"})

compartment_id = "ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa"
metastore_id = "ocid1.datacatalogmetastore.oc1.iad.amaaaaaabiudgxyap7tizm4gscwz7amu7dixz7ml3mtesqzzwwg3urvvdgua"

# Load the patient test result data.
patient_result_df = pd.read_csv("https://objectstorage.us-ashburn-1.oraclecloud.com/p/hh2NOgFJbVSg4amcLM3G3hkTuHyBD-8aE_iCsuZKEvIav1Wlld-3zfCawG4ycQGN/n/ociodscdev/b/oci-feature-store/o/beta/data/EHR/data-ori.csv")

print(f"The dataset contains {patient_result_df.shape[0]} rows and {patient_result_df.shape[1]} columns")

# Get all the features, split into numerical and categorical.
features = [feat for feat in patient_result_df.columns if feat != 'SOURCE']
num_features = [feat for feat in features if patient_result_df[feat].dtype != object]
cat_features = [feat for feat in features if patient_result_df[feat].dtype == object]

print(f"Total number of features : {len(features)}")
print(f"Number of numerical features : {len(num_features)}")
print(f"Number of categorical features : {len(cat_features)}\n")
print(patient_result_df.isna().mean().to_frame(name='Missing %'))
print(patient_result_df.nunique().to_frame(name='# of unique values'))

# Create the feature store, the top-level design-time construct, backed by the metastore.
feature_store_resource = (
    FeatureStore()
    .with_description("Electronic Health Data consisting of Patient Test Results")
    .with_compartment_id(compartment_id)
    .with_display_name("EHR details")
    .with_offline_config(metastore_id=metastore_id)
)
feature_store = feature_store_resource.create()
print(feature_store)

# Create the entity that groups the EHR feature groups.
entity = feature_store.create_entity(
    display_name="EHR",
    description="Electronic Health Record predictions"
)
print(entity)

def chained_transformation(patient_result_df, **transformation_args):
    def label_encoder_transformation(patient_result_df, **transformation_args):
        from sklearn.preprocessing import LabelEncoder
        # Creating an instance of LabelEncoder.
        labelencoder = LabelEncoder()
        result_df = patient_result_df.copy()
        column_labels = transformation_args.get("label_encode_column")
        if isinstance(column_labels, list):
            for col in column_labels:
                result_df[col] = labelencoder.fit_transform(result_df[col])
        elif isinstance(column_labels, str):
            result_df[column_labels] = labelencoder.fit_transform(result_df[column_labels])
        else:
            return None
        return result_df

    def min_max_scaler(patient_result_df, **transformation_args):
        from sklearn.preprocessing import MinMaxScaler
        final_result_df = patient_result_df.copy()
        scaler = MinMaxScaler(feature_range=(0, 1))
        column_labels = transformation_args.get("scaling_column_labels")
        final_result_df[column_labels] = scaler.fit_transform(final_result_df[column_labels])
        # Return the scaled copy.
        return final_result_df

    def feature_removal(input_df, **transformation_args):
        output_df = input_df.copy()
        output_df.drop(transformation_args.get("redundant_feature_label"), axis=1, inplace=True)
        return output_df

    # Chain: label-encode, then scale, then drop redundant features.
    out1 = label_encoder_transformation(patient_result_df, **transformation_args)
    out2 = min_max_scaler(out1, **transformation_args)
    return feature_removal(out2, **transformation_args)

transformation_args = {
    "label_encode_column": ["SEX", "SOURCE"],
    "scaling_column_labels": num_features,
    "redundant_feature_label": ["MCH", "MCHC", "MCV"]
}

from ads.feature_store.transformation import Transformation, TransformationMode

# Register the transformation function with the feature store.
transformation = (
    Transformation()
    .with_display_name("chained_transformation")
    .with_feature_store_id(feature_store.id)
    .with_source_code_function(chained_transformation)
    .with_transformation_mode(TransformationMode.PANDAS)
    .with_description("transformation to perform feature engineering")
    .with_compartment_id(compartment_id)
)

transformation.create()
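
# Optional local sanity check (illustrative addition, not in the original
# script): because the transformation uses TransformationMode.PANDAS, the
# same function can be previewed directly on the source DataFrame.
preview_df = chained_transformation(patient_result_df, **transformation_args)
print(f"Transformed preview: {preview_df.shape[0]} rows, {preview_df.shape[1]} columns")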

# Define the feature group with the schema inferred from the DataFrame and
# the registered transformation attached.
feature_group_ehr = (
    FeatureGroup()
    .with_feature_store_id(feature_store.id)
    .with_primary_keys([])
    .with_name("ehr_feature_group")
    .with_entity_id(entity.id)
    .with_compartment_id(compartment_id)
    .with_schema_details_from_dataframe(patient_result_df)
    .with_transformation_id(transformation.id)
    .with_transformation_kwargs(transformation_args)
)
feature_group_ehr.create()

# Attach a great_expectations suite to validate data at ingestion time.
expectation_suite_ehr = ExpectationSuite(
    expectation_suite_name="test_hcm_df"
)
expectation_suite_ehr.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "AGE"},
    )
)
expectation_suite_ehr.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={"column": "HAEMOGLOBINS", "min_value": 0, "max_value": 30},
    )
)

# STRICT mode fails ingestion when the expectations are not met.
feature_group_ehr.with_expectation_suite(expectation_suite_ehr, expectation_type=ExpectationType.STRICT)
feature_group_ehr.update()
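
# Print the new feature group id so a separate ingestion job can look it up
# with FeatureGroup.from_id() (illustrative addition, not in the original script).
print(f"Created feature group: {feature_group_ehr.id}")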
