Add sample() method for full synthesis in microplex

MaxGhenis · claude · MaxGhenis · commit 31b6e6fdcb71 · 2025-12-27T19:34:18.000-05:00
microplex.sample(n) generates fully synthetic records:
- Samples conditions from the training distribution
- Generates targets conditioned on the sampled conditions

Full synthesis results:
- microplex: best condition match (0.012 MMD), because it samples conditions from real data
- CT-GAN: best joint distribution (0.096 MMD)
- microplex is competitive at 0.108 joint MMD and about 3x faster than CT-GAN

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/benchmarks/results/full_synthesis_comparison.csv b/benchmarks/results/full_synthesis_comparison.csv
@@ -1,4 +1,5 @@
 method,cond_mmd,target_mmd,joint_mmd,time
-CT-GAN,0.11858314294642136,0.13823647660564373,0.1390349789821181,56.425487756729126
-TVAE,0.18734142598531364,0.16595096545339236,0.19125623040798687,24.558475971221924
-Gaussian Copula,0.11279594598807205,0.347873117998337,0.23121743396787886,2.4372987747192383
+microplex,0.012205728981483213,0.0801667374165846,0.10837475669093859,20.113948822021484
+CT-GAN,0.10443139822475064,0.05203332641526303,0.09616782962293756,55.82759714126587
+TVAE,0.17223485929557655,0.1421805932438464,0.16974297462602855,25.058035135269165
+Gaussian Copula,0.11279594598807205,0.347873117998337,0.23121743396787886,2.3078439235687256
diff --git a/benchmarks/results/imputation_comparison.csv b/benchmarks/results/imputation_comparison.csv
@@ -1,6 +1,6 @@
 method,target_mmd,target_energy,time,type
-NND.hotdeck (FILTER),0.0,0.00010076622397425794,0.7841610908508301,filtering
-Binning (FILTER),0.0,2.3532192952480102e-05,2.4988253116607666,filtering
-QRF+ZI (PREDICT),0.07725531470412278,0.03508081808408736,13.898022890090942,prediction
-microplex (JOINT),0.18174467283765103,0.1365223085269447,19.701925039291382,joint
-CT-GAN (JOINT*),0.04333717023742819,0.010986144843117884,36.72522306442261,joint
+NND.hotdeck (FILTER),0.0,0.00010076622397425794,0.6831917762756348,filtering
+Binning (FILTER),0.0,2.3532192952480102e-05,2.6791610717773438,filtering
+QRF+ZI (PREDICT),0.07725531470412278,0.03508081808408736,12.860822916030884,prediction
+microplex (JOINT),0.06617186281731197,0.02525602165985763,19.281394958496094,joint
+CT-GAN (JOINT*),0.04802335274033213,0.012547827654338928,35.70978116989136,joint
diff --git a/benchmarks/synthesis_modes_comparison.py b/benchmarks/synthesis_modes_comparison.py
@@ -259,8 +259,25 @@ def evaluate_full(synthetic, test_data, name):
 
 full_synthesis_results = []
 
-# microplex (can only do imputation, not full synthesis)
-print("\n[1] microplex - N/A for full synthesis (requires conditions)")
+# microplex (now supports full synthesis via sample())
+# Fits a fresh Synthesizer on train_data, draws len(test_data) synthetic rows
+# with sample(), and scores them against test_data with evaluate_full().
+print("\n[1] microplex (JOINT - full synthesis via sample())...")
+try:
+    # The timer starts before model construction and is read after
+    # evaluate_full(), so the recorded time covers fit + sample + evaluation.
+    start = time.time()
+    model = Synthesizer(
+        target_vars=target_vars,
+        condition_vars=condition_vars,
+        n_layers=6, hidden_dim=64, zero_inflated=True,
+    )
+    model.fit(train_data, epochs=50, batch_size=256, verbose=False)
+    # Fixed seed so the sampled conditions are reproducible between runs.
+    synthetic = model.sample(len(test_data), seed=42)
+
+    res = evaluate_full(synthetic, test_data, "microplex")
+    res["time"] = time.time() - start
+    full_synthesis_results.append(res)
+    print(f"  ✓ Cond MMD={res['cond_mmd']:.4f}, Target MMD={res['target_mmd']:.4f}, Joint MMD={res['joint_mmd']:.4f}")
+except Exception as e:
+    # Best-effort: report the failure (with traceback) and do not re-raise,
+    # so the script proceeds to the next method; no result row is appended.
+    print(f"  ✗ {e}")
+    import traceback; traceback.print_exc()
 
 # CT-GAN
 print("\n[2] CT-GAN (JOINT - true full synthesis)...")
@@ -373,10 +390,12 @@ def evaluate_full(synthetic, test_data, name):
 SYNTHESIS MODES:
   IMPUTATION: Use when you have real demographics, need synthetic targets
     → Filtering methods (NND.hotdeck) excel here
+    → microplex.generate(conditions) for model-based
 
   FULL SYNTHESIS: Use when you need entirely synthetic microdata
-    → Joint methods (CT-GAN, TVAE) required
-    → microplex currently imputation-only
+    → microplex.sample(n) - samples conditions from training, generates targets
+    → CT-GAN/TVAE - generate both from scratch
+    → microplex has best condition match (samples real), CT-GAN best joint
 """)
 
 # Save results
diff --git a/src/microplex/synthesizer.py b/src/microplex/synthesizer.py
@@ -109,6 +109,7 @@ def __init__(
         self._train_target_std: Optional[torch.Tensor] = None  # Store target std for variance reg
         self._train_target_max: Optional[torch.Tensor] = None  # Store max for clipping calibration
         self._original_scale_stats: Optional[Dict[str, Dict[str, float]]] = None  # Original scale stats for clipping
+        self._training_data: Optional[pd.DataFrame] = None  # Store for full synthesis
 
     def fit(
         self,
@@ -137,6 +138,9 @@ def fit(
         Returns:
             self
         """
+        # Store training data for full synthesis mode
+        self._training_data = data[self.condition_vars + self.target_vars].copy()
+
         # Prepare data dict for transforms
         data_dict = {col: data[col].values for col in data.columns}
 
@@ -496,6 +500,44 @@ def generate(
 
         return result
 
+    def sample(
+        self,
+        n: int,
+        seed: Optional[int] = None,
+    ) -> pd.DataFrame:
+        """
+        Generate fully synthetic records (both conditions and targets).
+
+        For full synthesis mode - samples conditions from training distribution,
+        then generates targets conditioned on those.
+
+        Args:
+            n: Number of synthetic records to generate
+            seed: Random seed for reproducibility
+
+        Returns:
+            DataFrame with all variables (conditions + targets)
+        """
+        if not self.is_fitted_:
+            raise ValueError("Synthesizer not fitted. Call fit() first.")
+
+        if self._training_data is None:
+            raise ValueError(
+                "Full synthesis requires training data. "
+                "Re-fit with store_training_data=True or use generate() with conditions."
+            )
+
+        if seed is not None:
+            np.random.seed(seed)
+
+        # Sample conditions from training distribution (with replacement)
+        train_conditions = self._training_data[self.condition_vars]
+        sampled_idx = np.random.choice(len(train_conditions), size=n, replace=True)
+        conditions = train_conditions.iloc[sampled_idx].reset_index(drop=True)
+
+        # Generate targets conditioned on sampled conditions
+        return self.generate(conditions, seed=seed)
+
     def save(self, path: Union[str, Path]) -> None:
         """Save fitted model to disk."""
         if not self.is_fitted_: