|
1 | | -{ |
2 | | - "cells": [ |
3 | | - { |
4 | | - "cell_type": "code", |
5 | | - "execution_count": 1, |
6 | | - "metadata": {}, |
7 | | - "outputs": [ |
8 | | - { |
9 | | - "ename": "ModuleNotFoundError", |
10 | | - "evalue": "No module named 'jmespath'", |
11 | | - "output_type": "error", |
12 | | - "traceback": [ |
13 | | - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
14 | | - "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", |
15 | | - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyspark\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DataFrame\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyspark\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m SparkSession\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdbldatagen\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DataGenerator\n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m# Initialize Spark session\u001b[39;00m\n\u001b[32m 10\u001b[39m spark = SparkSession.builder \\\n\u001b[32m 11\u001b[39m .appName(\u001b[33m\"\u001b[39m\u001b[33mSynthetic Data Test\u001b[39m\u001b[33m\"\u001b[39m) \\\n\u001b[32m 12\u001b[39m .getOrCreate()\n", |
16 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/GitHub/python-feature-set/.venv/lib/python3.13/site-packages/dbldatagen/__init__.py:26\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# Copyright (C) 2019 Databricks, Inc.\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 14\u001b[39m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m 17\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 18\u001b[39m \u001b[33;03mThis module defines the package contents for the test data generator library\u001b[39;00m\n\u001b[32m 19\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 23\u001b[39m \u001b[33;03mMost of the other classes are used for internal purposes only\u001b[39;00m\n\u001b[32m 24\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m26\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata_generator\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DataGenerator\n\u001b[32m 27\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdatagen_constants\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \\\n\u001b[32m 28\u001b[39m RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION, \\\n\u001b[32m 29\u001b[39m INFER_DATATYPE, SPARK_DEFAULT_PARALLELISM\n\u001b[32m 30\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ensure, topologicalSort, mkBoundsList, coalesce_values, \\\n\u001b[32m 31\u001b[39m deprecated, parse_time_interval, DataGenError, 
split_list_matching_condition, strip_margins, \\\n\u001b[32m 32\u001b[39m json_value_from_path, system_time_millis\n", |
17 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/GitHub/python-feature-set/.venv/lib/python3.13/site-packages/dbldatagen/data_generator.py:15\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyspark\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtypes\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m LongType, IntegerType, StringType, StructType, StructField, DataType\n\u001b[32m 14\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01m_version\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m _get_spark_version\n\u001b[32m---> \u001b[39m\u001b[32m15\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcolumn_generation_spec\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ColumnGenerationSpec\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconstraints\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconstraint\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Constraint\n\u001b[32m 17\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconstraints\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql_expr\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m SqlExpr\n", |
18 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/GitHub/python-feature-set/.venv/lib/python3.13/site-packages/dbldatagen/column_generation_spec.py:21\u001b[39m\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyspark\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mF\u001b[39;00m\n\u001b[32m 18\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpyspark\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msql\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtypes\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FloatType, IntegerType, StringType, DoubleType, BooleanType, \\\n\u001b[32m 19\u001b[39m TimestampType, DataType, DateType, ArrayType, MapType, StructType\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcolumn_spec_options\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ColumnSpecOptions\n\u001b[32m 22\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdatagen_constants\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, \\\n\u001b[32m 23\u001b[39m DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, INFER_DATATYPE\n\u001b[32m 25\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdaterange\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DateRange\n", |
19 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/GitHub/python-feature-set/.venv/lib/python3.13/site-packages/dbldatagen/column_spec_options.py:13\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[33;03m.. title::Column Spec Options\u001b[39;00m\n\u001b[32m 7\u001b[39m \n\u001b[32m 8\u001b[39m \u001b[33;03mThis file defines the `ColumnSpecOptions` class\u001b[39;00m\n\u001b[32m 9\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcopy\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m13\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ensure\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mclass\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mColumnSpecOptions\u001b[39;00m(\u001b[38;5;28mobject\u001b[39m):\n\u001b[32m 17\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\" Column spec options object - manages options for column specs.\u001b[39;00m\n\u001b[32m 18\u001b[39m \n\u001b[32m 19\u001b[39m \u001b[33;03m This class has limited functionality - mainly used to validate and document the options,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 140\u001b[39m \n\u001b[32m 141\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n", |
20 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/GitHub/python-feature-set/.venv/lib/python3.13/site-packages/dbldatagen/utils.py:18\u001b[39m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mwarnings\u001b[39;00m\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdatetime\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m timedelta\n\u001b[32m---> \u001b[39m\u001b[32m18\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mjmespath\u001b[39;00m\n\u001b[32m 21\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdeprecated\u001b[39m(message=\u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 22\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 23\u001b[39m \u001b[33;03m Define a deprecated decorator without dependencies on 3rd party libraries\u001b[39;00m\n\u001b[32m 24\u001b[39m \n\u001b[32m 25\u001b[39m \u001b[33;03m Note there is a 3rd party library called `deprecated` that provides this feature but goal is to only have\u001b[39;00m\n\u001b[32m 26\u001b[39m \u001b[33;03m dependencies on packages already used in the Databricks runtime\u001b[39;00m\n\u001b[32m 27\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n", |
21 | | - "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'jmespath'" |
22 | | - ] |
23 | | - } |
24 | | - ], |
25 | | - "source": [ |
26 | | - "# This script generates synthetic data using the Databricks dbldatagen library\n", |
27 | | - "# and performs basic analysis on the generated data.\n", |
28 | | - "# Ensure dbldatagen is installed; outside the Databricks runtime, also install the packages it imports (e.g. jmespath).\n", |
29 | | - "# Import necessary libraries\n", |
30 | | - "from pyspark.sql import DataFrame\n", |
31 | | - "from pyspark.sql import SparkSession\n", |
32 | | - "from dbldatagen import DataGenerator\n", |
33 | | - "\n", |
34 | | - "# Spark session shared by the functions below; getOrCreate() hands back an existing session when there is one\n", |
35 | | - "spark = (\n", |
36 | | - "    SparkSession.builder.appName(\"Synthetic Data Test\").getOrCreate()\n", |
37 | | - ")\n", |
38 | | - "\n", |
39 | | - "def generate_synthetic_data(rows=1000, partitions=4):\n", |
40 | | - "    \"\"\"\n", |
41 | | - "    Build a small synthetic employee-style dataset with dbldatagen's DataGenerator.\n", |
42 | | - "\n", |
43 | | - "    Args:\n", |
44 | | - "        rows (int): Row count passed to the generator.\n", |
45 | | - "        partitions (int): Partition count passed to the generator.\n", |
46 | | - "\n", |
47 | | - "    Returns:\n", |
48 | | - "        DataFrame: Built output with the generator's id output plus name, age, salary, department and join_date.\n", |
49 | | - "    \"\"\"\n", |
50 | | - "    names = [\"Alice\", \"Bob\", \"Charlie\", \"David\"]\n", |
51 | | - "    departments = [\"HR\", \"Engineering\", \"Marketing\", \"Sales\"]\n", |
52 | | - "\n", |
53 | | - "    # Configure the generator one column at a time; uses the notebook-level `spark` session\n", |
54 | | - "    generator = DataGenerator(spark, name=\"synthetic_data\", rows=rows, partitions=partitions)\n", |
55 | | - "    generator = generator.withIdOutput()\n", |
56 | | - "    generator = generator.withColumn(\"name\", \"string\", values=names)\n", |
57 | | - "    generator = generator.withColumn(\"age\", \"integer\", minValue=18, maxValue=60)\n", |
58 | | - "    generator = generator.withColumn(\"salary\", \"float\", minValue=30000, maxValue=120000)\n", |
59 | | - "    generator = generator.withColumn(\"department\", \"string\", values=departments)\n", |
60 | | - "    generator = generator.withColumn(\"join_date\", \"date\", begin=\"2020-01-01\", end=\"2023-12-31\")\n", |
61 | | - "    return generator.build()\n", |
62 | | - "\n", |
63 | | - "def analyze_data(dataframe):\n", |
64 | | - "    \"\"\"\n", |
65 | | - "    Print three labelled sections in order: schema, a 10-row untruncated sample, and describe() output.\n", |
66 | | - "\n", |
67 | | - "    Args:\n", |
68 | | - "        dataframe (DataFrame): Spark DataFrame to inspect.\n", |
69 | | - "\n", |
70 | | - "    Returns:\n", |
71 | | - "        None\n", |
72 | | - "    \"\"\"\n", |
73 | | - "    sections = (\n", |
74 | | - "        (\"Schema of the DataFrame:\", lambda: dataframe.printSchema()),\n", |
75 | | - "        (\"\\nSample Data:\", lambda: dataframe.show(10, truncate=False)),\n", |
76 | | - "        (\"\\nSummary Statistics:\", lambda: dataframe.describe().show()),\n", |
77 | | - "    )\n", |
78 | | - "    for heading, render in sections:\n", |
79 | | - "        print(heading)\n", |
80 | | - "        render()\n", |
81 | | - "\n", |
82 | | - "# Example usage: build the dataset, then print its report\n", |
83 | | - "if __name__ == \"__main__\":\n", |
84 | | - "    # Arguments are spelled out here; they match the function's own defaults\n", |
85 | | - "    synthetic_df = generate_synthetic_data(\n", |
86 | | - "        rows=1000, partitions=4\n", |
87 | | - "    )\n", |
88 | | - "    analyze_data(synthetic_df)" |
89 | | - ] |
90 | | - } |
91 | | - ], |
92 | | - "metadata": { |
93 | | - "kernelspec": { |
94 | | - "display_name": ".venv", |
95 | | - "language": "python", |
96 | | - "name": "python3" |
97 | | - }, |
98 | | - "language_info": { |
99 | | - "codemirror_mode": { |
100 | | - "name": "ipython", |
101 | | - "version": 3 |
102 | | - }, |
103 | | - "file_extension": ".py", |
104 | | - "mimetype": "text/x-python", |
105 | | - "name": "python", |
106 | | - "nbconvert_exporter": "python", |
107 | | - "pygments_lexer": "ipython3", |
108 | | - "version": "3.13.2" |
109 | | - } |
110 | | - }, |
111 | | - "nbformat": 4, |
112 | | - "nbformat_minor": 2 |
113 | | -} |
0 commit comments