diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..5c00379 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,48 @@ +name: Build & Publish to PyPI + +on: + push: + tags: ["v*"] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build sdist & wheel + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Build + run: | + pip install build + python -m build + + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/* + + publish: + name: Publish to PyPI + needs: [build] + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + permissions: + id-token: write + + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fcdaf29..29b48dc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.11", "3.12", "3.13"] steps: - name: Check out repository uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index b46da29..fe2d913 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,33 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.2] - 2026-04-08 + +### Added +- Full GPU acceleration for the PQ pipeline (fit, transform, predict) via PyTorch + Triton kernels +- Triton JIT-compiled `_pq_assign_kernel` for cluster assignment — never materializes the N×K distance matrix +- GPU-accelerated `PQEncoder.fit()` with `device='gpu'|'auto'` parameter +- GPU-accelerated `PQEncoder.transform()` with automatic VRAM-aware batching +- GPU-accelerated `PQKMeans.fit()` and `.predict()` with automatic CPU fallback +- Early-stopping tolerance (`tol`) parameter for `PQKMeans` GPU training path +- New `_update_centers()` function with chunked histogram accumulation for bounded memory +- New scripts: `benchmark_1B_pipeline.py`, `benchmark_gpu_predict.py`, `cluster_smiles.py`, `k_selection_gpu.py` +- Comprehensive GPU test suite in `test_clustering.py` + +### Changed +- `PQEncoder.fit()` now accepts `device` parameter (`'cpu'`, `'gpu'`, `'auto'`) +- `PQEncoder.transform()` automatically uses GPU when available +- `PQKMeans` uses GPU path when CUDA/Triton are detected, with transparent CPU fallback +- MQN fingerprint dtype changed to `int16` for correctness +- Migrated from `tmap-silicon` + `faerun` to `tmap2` for visualization +- Rewrote `visualization.py` to use tmap2's `TMAP`, `TmapViz`, and chemistry utilities +- Removed `pandarallel` dependency (parallelism now handled by tmap2) +- Default Morgan fingerprint bits changed from 1024 to 2048 (tmap2 default) + +### Fixed +- Stale codebook cache bug in encoder +- PyTorch monkeypatch guard for CI compatibility + ## [0.2.1] - 2026-03-06 ### Added diff --git a/chelombus/__init__.py b/chelombus/__init__.py index 1ad1a59..9524403 100644 --- a/chelombus/__init__.py +++ b/chelombus/__init__.py @@ -47,7 +47,7 @@ def _cluster_io_not_available(*args, **kwargs): query_clusters_batch = _cluster_io_not_available sample_from_cluster = _cluster_io_not_available -__version__ = "0.2.1" +__version__ = "0.2.2" __all__ = [ # Core classes "DataStreamer", diff --git a/chelombus/utils/visualization.py b/chelombus/utils/visualization.py index 920a355..7a156eb 100644 --- a/chelombus/utils/visualization.py +++ b/chelombus/utils/visualization.py @@ -1,98 +1,23 @@ """TMAP visualization utilities for molecular datasets. Provides functions to create TMAPs from SMILES strings with customizable -molecular properties and fingerprint options. +molecular properties and fingerprint options. Built on top of tmap2. """ -from rdkit import Chem -from rdkit.Chem import Descriptors, rdMolDescriptors -from faerun import Faerun -import tmap as tm import argparse -import pandas as pd -from pandarallel import pandarallel -from chelombus.utils.fingerprints import FingerprintCalculator import numpy as np -from sklearn.neighbors import NearestNeighbors -from typing import Callable - -# Available molecular properties with display names and calculation functions -AVAILABLE_PROPERTIES: dict[str, tuple[str, Callable]] = { - 'hac': ('HAC', lambda mol: mol.GetNumHeavyAtoms()), - 'num_aromatic_atoms': ('Number Aromatic Atoms', lambda mol: sum(1 for atom in mol.GetAtoms() if atom.GetIsAromatic())), - 'num_hba': ('NumHBA', rdMolDescriptors.CalcNumHBA), - 'num_hbd': ('NumHBD', rdMolDescriptors.CalcNumHBD), - 'num_rings': ('Number of Rings', rdMolDescriptors.CalcNumRings), - 'mw': ('MW', Descriptors.ExactMolWt), - 'clogp': ('cLogP', Descriptors.MolLogP), - 'fsp3': ('Fraction Csp3', Descriptors.FractionCSP3), -} - -DEFAULT_PROPERTIES = ['hac', 'num_aromatic_atoms', 'num_hba', 'num_hbd', 'num_rings', 'mw', 'clogp', 'fsp3'] - - -def _mol_properties_from_smiles(smiles: str, properties: list[str]) -> tuple | None: - """Get molecular properties from a single SMILES string.""" - mol = Chem.MolFromSmiles(smiles) - if mol is None: - return None - - values = [] - for prop in properties: - display_name, calc_func = AVAILABLE_PROPERTIES[prop] - values.append(calc_func(mol)) - - return tuple(values) - - -def calc_properties(smiles: list[str], properties: list[str] | None = None) -> pd.DataFrame: - """Calculate selected molecular properties for a list of SMILES. - - Args: - smiles: List of SMILES strings. - properties: List of property identifiers to calculate. - If None, uses DEFAULT_PROPERTIES. - Available: hac, num_aromatic_atoms, num_hba, num_hbd, - num_rings, mw, clogp, fsp3 - - Returns: - DataFrame with 'smiles' column and selected property columns. - - Raises: - ValueError: If unknown property identifiers are provided. - """ - if properties is None: - properties = DEFAULT_PROPERTIES - - # Validate properties - invalid = set(properties) - set(AVAILABLE_PROPERTIES.keys()) - if invalid: - raise ValueError(f"Unknown properties: {invalid}. Available: {list(AVAILABLE_PROPERTIES.keys())}") - - pandarallel.initialize(progress_bar=False, verbose=0) - - dataframe = pd.DataFrame({'smiles': smiles}) - - # Get display names for columns - column_names = [AVAILABLE_PROPERTIES[prop][0] for prop in properties] - - dataframe[column_names] = dataframe['smiles'].apply( - lambda s: _mol_properties_from_smiles(s, properties) - ).parallel_apply(pd.Series) - - return dataframe +import pandas as pd +from typing import Sequence +from tmap import TMAP +from tmap.utils import fingerprints_from_smiles, molecular_properties, AVAILABLE_PROPERTIES -def _calculate_fingerprint(smiles: list[str], fp: str, bits: int = 1024, radius: int = 2) -> np.ndarray: - """Calculate fingerprints for a list of SMILES.""" - fp_calc = FingerprintCalculator() - fp_arr = fp_calc.FingerprintFromSmiles(smiles, fp=fp, fpSize=bits, radius=radius) - return fp_arr +DEFAULT_PROPERTIES = list(AVAILABLE_PROPERTIES) def create_tmap( smiles: list[str], fingerprint: str = 'morgan', - fingerprint_bits: int = 1024, + fingerprint_bits: int = 2048, fingerprint_radius: int = 2, properties: list[str] | None = None, tmap_name: str = 'my_tmap', @@ -100,199 +25,121 @@ def create_tmap( ) -> None: """Create a TMAP visualization from SMILES strings. - Uses LSH Forest for layout generation. - Args: smiles: List of SMILES strings to visualize. fingerprint: Fingerprint type ('morgan' or 'mqn'). - fingerprint_bits: Number of bits for fingerprint (default: 1024). + fingerprint_bits: Number of bits for Morgan fingerprint (default: 2048). fingerprint_radius: Radius for Morgan fingerprint (default: 2). properties: List of molecular properties to display as color channels. - If None, uses all default properties. - tmap_name: Name for the output TMAP file. + If None, uses all available properties. + tmap_name: Name for the output TMAP HTML file. k_neighbors: Number of neighbors for layout (default: 20). """ if properties is None: properties = DEFAULT_PROPERTIES - # Validate properties - invalid = set(properties) - set(AVAILABLE_PROPERTIES.keys()) - if invalid: - raise ValueError(f"Unknown properties: {invalid}. Available: {list(AVAILABLE_PROPERTIES.keys())}") - - # Calculate fingerprints - fp_arr = _calculate_fingerprint(smiles, fingerprint, fingerprint_bits, fingerprint_radius) - - # Calculate molecular properties - descriptors_df = calc_properties(smiles, properties) - - # Create TMAP fingerprints - tm_fingerprint = [tm.VectorUint(fp) for fp in fp_arr] - - # Build LSH Forest - lf = tm.LSHForest(fingerprint_bits) - lf.batch_add(tm_fingerprint) - lf.index() - - # Layout configuration - cfg = tm.LayoutConfiguration() - cfg.node_size = 1/30 - cfg.mmm_repeats = 2 - cfg.sl_extra_scaling_steps = 10 - cfg.k = k_neighbors - cfg.sl_scaling_type = tm.RelativeToAvgLength - x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg) - - # Create labels - labels = descriptors_df['smiles'].tolist() - - # Get color data for selected properties - column_names = [AVAILABLE_PROPERTIES[prop][0] for prop in properties] - c = [descriptors_df[col].to_numpy() for col in column_names] - - # Plotting - f = Faerun( - view="front", - coords=False, - title="", - clear_color="#FFFFFF", - ) + _validate_properties(properties) - f.add_scatter( - tmap_name + "_TMAP", - { - "x": x, - "y": y, - "c": np.array(c), - "labels": labels, - }, - shader="smoothCircle", - point_scale=2.5, - max_point_size=20, - interactive=True, - series_title=column_names, - has_legend=True, - colormap=['rainbow'] * len(properties), - categorical=[False] * len(properties), - ) + # Compute fingerprints via tmap2 + fp_kwargs = {} + if fingerprint == 'morgan': + fp_kwargs = {'n_bits': fingerprint_bits, 'radius': fingerprint_radius} + + fps = fingerprints_from_smiles(smiles, fp_type=fingerprint, **fp_kwargs) + + # Fit TMAP + model = TMAP(metric='jaccard', n_neighbors=k_neighbors).fit(fps) + + # Build visualization + viz = model.to_tmapviz(include_edges=True) + viz.title = tmap_name + viz.background_color = "#FFFFFF" - f.add_tree(tmap_name + "_TMAP_tree", {"from": s, "to": t}, point_helper=tmap_name + "_TMAP") - f.plot(tmap_name + "_TMAP", template='smiles') + # Add molecular properties as color layers + props = molecular_properties(smiles, properties=properties) + for prop_name, values in props.items(): + viz.add_color_layout(prop_name, values, categorical=False, color='rainbow') + + # Add SMILES for structure rendering in tooltips + viz.add_smiles(smiles) + + viz.write_html(f"{tmap_name}.html") def representative_tmap( smiles: list[str], - cluster_ids: list[int | str] | None = None, + cluster_ids: Sequence[int | str] | None = None, fingerprint: str = 'mqn', - fingerprint_bits: int = 1024, + fingerprint_bits: int = 2048, + fingerprint_radius: int = 2, properties: list[str] | None = None, tmap_name: str = 'representative_tmap', k_neighbors: int = 21, ) -> None: - """Create a TMAP for representative molecules using edge list layout. + """Create a TMAP for representative molecules. - Uses scikit-learn's NearestNeighbors for edge list construction and - TMAP's layout_from_edge_list for positioning. Recommended for primary - TMAPs showing cluster representatives. + Recommended for primary TMAPs showing cluster representatives. Args: smiles: List of SMILES strings (typically cluster representatives). cluster_ids: Optional list of cluster IDs corresponding to each SMILES. - If provided, cluster ID will be shown in labels. + If provided, added as a categorical color channel. fingerprint: Fingerprint type ('mqn' recommended, or 'morgan'). - fingerprint_bits: Number of bits for fingerprint (default: 1024). + fingerprint_bits: Number of bits for Morgan fingerprint (default: 2048). + fingerprint_radius: Radius for Morgan fingerprint (default: 2). properties: List of molecular properties to display as color channels. - If None, uses all default properties. - tmap_name: Name for the output TMAP file. + If None, uses all available properties. + tmap_name: Name for the output TMAP HTML file. k_neighbors: Number of neighbors for KNN graph (default: 21). """ if properties is None: properties = DEFAULT_PROPERTIES - # Validate properties - invalid = set(properties) - set(AVAILABLE_PROPERTIES.keys()) - if invalid: - raise ValueError(f"Unknown properties: {invalid}. Available: {list(AVAILABLE_PROPERTIES.keys())}") - - # Calculate fingerprints (default MQN for representatives) - fp_arr = _calculate_fingerprint(smiles, fingerprint, fingerprint_bits) - - # Build edge list using scikit-learn - nbrs = NearestNeighbors(n_neighbors=k_neighbors, metric='euclidean').fit(fp_arr) - distances, indices = nbrs.kneighbors(fp_arr) - - edge_list = [] - for i in range(len(fp_arr)): - for neighbor_idx, dist in zip(indices[i, 1:], distances[i, 1:]): - edge_list.append((i, neighbor_idx, float(dist))) - - # Calculate molecular properties - descriptors_df = calc_properties(smiles, properties) - - # Layout configuration - cfg = tm.LayoutConfiguration() - cfg.node_size = 1/30 - cfg.mmm_repeats = 2 - cfg.sl_extra_scaling_steps = 10 - cfg.k = k_neighbors - cfg.sl_scaling_type = tm.RelativeToAvgLength - x, y, s, t, _ = tm.layout_from_edge_list(len(fp_arr), edge_list, cfg) - - # Create labels with optional cluster_id - labels = [] - for idx, row in descriptors_df.iterrows(): - smi = row['smiles'] - if cluster_ids is not None: - labels.append(f"{smi}" + "__" + f"Cluster ID: {cluster_ids[idx]}") - else: - labels.append(smi) + _validate_properties(properties) - # Get color data for selected properties - column_names = [AVAILABLE_PROPERTIES[prop][0] for prop in properties] - c = [descriptors_df[col].to_numpy() for col in column_names] + # Compute fingerprints via tmap2 + fp_kwargs = {} + if fingerprint == 'morgan': + fp_kwargs = {'n_bits': fingerprint_bits, 'radius': fingerprint_radius} - # Add cluster_id as a color channel if provided + fps = fingerprints_from_smiles(smiles, fp_type=fingerprint, **fp_kwargs) + + # Fit TMAP — use euclidean for dense descriptors like MQN + metric = 'euclidean' if fingerprint == 'mqn' else 'jaccard' + model = TMAP(metric=metric, n_neighbors=k_neighbors).fit(fps) + + # Build visualization + viz = model.to_tmapviz(include_edges=True) + viz.title = tmap_name + viz.background_color = "#FFFFFF" + + # Add molecular properties as color layers + props = molecular_properties(smiles, properties=properties) + for prop_name, values in props.items(): + viz.add_color_layout(prop_name, values, categorical=False, color='rainbow') + + # Add cluster ID as categorical color channel if cluster_ids is not None: - # Convert cluster_ids to numeric for coloring - unique_clusters = list(set(cluster_ids)) - cluster_to_idx = {c: i for i, c in enumerate(unique_clusters)} - cluster_colors = np.array([cluster_to_idx[cid] for cid in cluster_ids]) - c.append(cluster_colors) - column_names.append('Cluster ID') - properties = list(properties) + ['cluster_id'] # For categorical list - - # Plotting - f = Faerun( - view="front", - coords=False, - title="", - clear_color="#FFFFFF", - ) + viz.add_color_layout( + 'Cluster ID', + [str(cid) for cid in cluster_ids], + categorical=True, + color='tab20', + ) + + # Add SMILES for structure rendering in tooltips + viz.add_smiles(smiles) + + viz.write_html(f"{tmap_name}.html") - # Set categorical for cluster_id if present - categorical = [False] * (len(properties) - 1) + [True] if cluster_ids is not None else [False] * len(properties) - colormaps = ['rainbow'] * (len(properties) - 1) + ['tab20'] if cluster_ids is not None else ['rainbow'] * len(properties) - - f.add_scatter( - tmap_name + "_TMAP", - { - "x": x, - "y": y, - "c": np.array(c), - "labels": labels, - }, - shader="smoothCircle", - point_scale=2.5, - max_point_size=20, - interactive=True, - series_title=column_names, - has_legend=True, - colormap=colormaps, - categorical=categorical, - ) - f.add_tree(tmap_name + "_TMAP_tree", {"from": s, "to": t}, point_helper=tmap_name + "_TMAP") - f.plot(tmap_name + "_TMAP", template='smiles') +def _validate_properties(properties: list[str]) -> None: + """Raise ValueError if any property name is not recognized by tmap2.""" + invalid = set(properties) - set(AVAILABLE_PROPERTIES) + if invalid: + raise ValueError( + f"Unknown properties: {invalid}. Available: {AVAILABLE_PROPERTIES}" + ) def main(): @@ -301,11 +148,11 @@ def main(): description="Generate TMAP visualizations from SMILES", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=f""" -Available properties: {', '.join(AVAILABLE_PROPERTIES.keys())} +Available properties: {', '.join(AVAILABLE_PROPERTIES)} Examples: chelombus-tmap --smiles molecules.smi - chelombus-tmap --smiles molecules.smi --fingerprint mqn --properties hac,mw,clogp + chelombus-tmap --smiles molecules.smi --fingerprint mqn --properties mw,logp,qed chelombus-tmap --smiles molecules.smi --cluster-file clusters.csv """ ) @@ -314,16 +161,16 @@ def main(): parser.add_argument('--fingerprint', type=str, default='morgan', choices=['morgan', 'mqn'], help="Fingerprint type (default: morgan)") - parser.add_argument('--bits', type=int, default=1024, - help="Fingerprint bits (default: 1024)") + parser.add_argument('--bits', type=int, default=2048, + help="Morgan fingerprint bits (default: 2048)") parser.add_argument('--radius', type=int, default=2, help="Morgan fingerprint radius (default: 2)") parser.add_argument('--properties', type=str, default=None, - help=f"Comma-separated properties (default: all). Available: {', '.join(AVAILABLE_PROPERTIES.keys())}") + help=f"Comma-separated properties (default: all). Available: {', '.join(AVAILABLE_PROPERTIES)}") parser.add_argument('--output', type=str, default='my_tmap', help="Output TMAP name (default: my_tmap)") parser.add_argument('--cluster-file', type=str, default=None, - help="Optional CSV/parquet file with 'smiles' and 'cluster_id' columns for representative TMAP") + help="CSV/parquet file with 'smiles' and 'cluster_id' columns for representative TMAP") parser.add_argument('--representative', action='store_true', help="Use representative_tmap layout (edge list based)") @@ -336,7 +183,6 @@ def main(): # Load data if args.cluster_file: - # Load from CSV/parquet with cluster_ids if args.cluster_file.endswith('.parquet'): df = pd.read_parquet(args.cluster_file) elif args.cluster_file.endswith('.csv'): @@ -352,11 +198,11 @@ def main(): cluster_ids=cluster_ids, fingerprint=args.fingerprint, fingerprint_bits=args.bits, + fingerprint_radius=args.radius, properties=properties, tmap_name=args.output, ) else: - # Load from simple SMILES file with open(args.smiles, 'r') as file: smiles = [line.strip() for line in file.readlines() if line.strip()] @@ -365,6 +211,7 @@ def main(): smiles=smiles, fingerprint=args.fingerprint, fingerprint_bits=args.bits, + fingerprint_radius=args.radius, properties=properties, tmap_name=args.output, ) diff --git a/docs/api.md b/docs/api.md index 50ab3ed..c05995a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -18,6 +18,12 @@ ::: chelombus.clustering.PyQKmeans.PQKMeans +## Visualization + +::: chelombus.utils.visualization.create_tmap + +::: chelombus.utils.visualization.representative_tmap + ## Cluster I/O ::: chelombus.utils.cluster_io diff --git a/docs/choosing-k.md b/docs/choosing-k.md index de866b5..5b54407 100644 --- a/docs/choosing-k.md +++ b/docs/choosing-k.md @@ -41,11 +41,24 @@ Benchmark configuration: AMD Ryzen 7, 64GB RAM, 10 PQk-means iterations per k. - **k = 100,000** if you need tighter clusters and can tolerate ~27% empty clusters. - Beyond 200K, over a third of clusters are empty — diminishing returns. +## GPU-accelerated k selection + +If you have a CUDA GPU, use `scripts/k_selection_gpu.py` for significantly faster sweeps: + +```bash +python scripts/k_selection_gpu.py \ + --pq-codes data/pq_codes.npy \ + --encoder models/encoder.joblib \ + --k-values 10000 50000 100000 200000 +``` + +The GPU path uses Triton kernels for the predict step and is typically 8-10x faster than CPU for large k values. + ## Scaling estimates Fit time scales linearly with both n (number of molecules) and k: -| Scenario | Estimated Fit Time | +| Scenario | CPU Fit Time | |---|---| | 1B molecules, k=50K | ~2.6 days | | 1B molecules, k=100K | ~5.2 days | diff --git a/docs/getting-started.md b/docs/getting-started.md index 9fcebb1..c8f843a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -8,6 +8,16 @@ pip install chelombus ``` +### GPU Support (optional) + +GPU acceleration requires PyTorch and Triton. Install them separately for your CUDA version: + +```bash +pip install torch triton +``` + +When CUDA is available, `PQEncoder.fit()`, `.transform()`, `PQKMeans.fit()`, and `.predict()` automatically use the GPU. No code changes needed — the library falls back to CPU transparently if CUDA is not detected. + ### From Source ```bash @@ -25,17 +35,18 @@ pip install -e . from chelombus import DataStreamer, FingerprintCalculator, PQEncoder, PQKMeans # 1. Stream SMILES in chunks -streamer = DataStreamer(path='molecules.smi', chunksize=100000) +streamer = DataStreamer() +smiles_gen = streamer.parse_input('molecules.smi', chunksize=100000, smiles_col=0) # 2. Calculate MQN fingerprints fp_calc = FingerprintCalculator() -for smiles_chunk in streamer.parse_input(): +for smiles_chunk in smiles_gen: fingerprints = fp_calc.FingerprintFromSmiles(smiles_chunk, fp='mqn') # Save fingerprints... -# 3. Train PQ encoder on sample +# 3. Train PQ encoder (uses GPU automatically if available) encoder = PQEncoder(k=256, m=6, iterations=20) -encoder.fit(training_fingerprints) +encoder.fit(training_fingerprints, device='auto') # 4. Transform all fingerprints to PQ codes pq_codes = encoder.transform(fingerprints) @@ -45,19 +56,82 @@ clusterer = PQKMeans(encoder, k=100000) labels = clusterer.fit_predict(pq_codes) ``` +## GPU Acceleration + +When PyTorch and Triton are installed with CUDA support, the entire PQ pipeline runs on GPU. + +**GPU benchmarks on 10M Enamine MQN fingerprints** (RTX 4070 Ti SUPER 16GB, k=10,000): + +| Stage | Time | +|:---|---:| +| `encoder.fit()` (10M, m=6, k=256) | 12.1s | +| `encoder.transform()` (10M) | 1.8s | +| `clusterer.fit()` (10M, k=10K, 5 iters) | 15.4s | +| `clusterer.predict()` (10M) | 2.1s | +| **Total** | **31.4s** | + +At 1B scale with k=100K, the full pipeline completes in ~2.9 hours on the same GPU. + +### Explicit device control + +```python +# Force GPU +encoder.fit(X_train, device='gpu') + +# Force CPU +encoder.fit(X_train, device='cpu') + +# Auto-detect (default for fit; transform/predict auto-detect always) +encoder.fit(X_train, device='auto') +``` + +The GPU path uses VRAM-aware batching — it queries free GPU memory and sizes batches automatically so you don't need to worry about OOM errors. + +## Visualization + +Chelombus uses [tmap2](https://github.com/afloresep/tmap2) for interactive TMAP visualizations. + +### From Python + +```python +from chelombus import sample_from_cluster +from chelombus.utils.visualization import create_tmap, representative_tmap + +# Visualize molecules from a cluster +df = sample_from_cluster('results/', cluster_id=42, n=1000) +create_tmap( + smiles=df['smiles'].tolist(), + fingerprint='morgan', + properties=['mw', 'logp', 'qed', 'n_rings'], + tmap_name='cluster_42', +) + +# Representative TMAP with cluster IDs +representative_tmap( + smiles=rep_smiles, + cluster_ids=rep_cluster_ids, + fingerprint='mqn', + tmap_name='representatives', +) +``` + +### From CLI + +```bash +chelombus-tmap --smiles molecules.smi --fingerprint morgan --output my_tmap +chelombus-tmap --cluster-file representatives.csv --output rep_tmap +``` + ## Pipeline Scripts For large-scale processing, use the pipeline scripts in `scripts/`: | Script | Description | |---|---| -| `calculate_fingerprints.py` | Compute MQN fingerprints from SMILES | -| `train_encoder.py` | Train the PQ encoder on a sample | -| `generate_pqcodes.py` | Transform fingerprints to PQ codes | -| `fit_pqkmeans.py` | Fit PQk-means clustering | -| `assign_clusters.py` | Assign molecules to clusters | -| `select_k.py` | Hyperparameter sweep for choosing k | -| `run_pipeline.py` | Run the full pipeline end-to-end | +| `cluster_smiles.py` | End-to-end: SMILES to clustered parquet | +| `benchmark_1B_pipeline.py` | Full pipeline benchmark at billion scale | +| `benchmark_gpu_predict.py` | GPU vs CPU predict benchmarks | +| `k_selection_gpu.py` | GPU-accelerated k hyperparameter sweep | ## Testing diff --git a/docs/index.md b/docs/index.md index 12ce0d0..0d2efd8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,19 +18,20 @@ SMILES → MQN Fingerprints → PQ Encoding → PQk-means Clustering → Nested - **Scalability**: Stream billions of molecules without loading everything into memory - **Efficiency**: Compress 42-dimensional MQN vectors to 6-byte PQ codes (28x compression) -- **Visualization**: Navigate from global overview to individual molecules in two clicks -- **Accessibility**: Runs on commodity hardware (tested: AMD Ryzen 7, 64GB RAM) +- **GPU Acceleration**: Full CUDA pipeline via PyTorch + Triton kernels for fit, transform, and predict +- **Visualization**: Interactive TMAPs powered by [tmap2](https://github.com/afloresep/tmap2) — navigate from global overview to individual molecules in two clicks +- **Accessibility**: Runs on commodity hardware (tested: AMD Ryzen 7, 64GB RAM); GPU optional ## Project Structure ``` chelombus/ ├── chelombus/ -│ ├── encoder/ # Product Quantization encoder -│ ├── clustering/ # PQk-means wrapper +│ ├── encoder/ # Product Quantization encoder (CPU + GPU) +│ ├── clustering/ # PQk-means wrapper + Triton kernels │ ├── streamer/ # Memory-efficient data streaming -│ └── utils/ # Fingerprints, visualization, helpers -├── scripts/ # Pipeline scripts +│ └── utils/ # Fingerprints, visualization, cluster I/O +├── scripts/ # Pipeline and benchmark scripts ├── examples/ # Tutorial notebooks └── tests/ # Unit tests ``` diff --git a/examples/enamine_1B_clustering.ipynb b/examples/enamine_1B_clustering.ipynb index b550bee..ea25073 100644 --- a/examples/enamine_1B_clustering.ipynb +++ b/examples/enamine_1B_clustering.ipynb @@ -89,18 +89,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "67096f1e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape: (10000000, 42), MB: 840.0\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np \n", "\n", @@ -118,18 +110,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "3108cd8f", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Training PQ-codes: 100%|██████████| 6/6 [15:30<00:00, 155.03s/it]\n" - ] - } - ], + "outputs": [], "source": [ "from chelombus import PQEncoder\n", "pq_encoder = PQEncoder(k=256, m=6, iterations=10)\n", @@ -146,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "ef22b025", "metadata": {}, "outputs": [], @@ -174,20 +158,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "10b8a8c1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The shape of the codebook is: (6, 256, 7)\n", - "Is the encoder trained? True\n", - "The total number of lables [164 183 183 ... 9 20 86] is: 10000000\n" - ] - } - ], + "outputs": [], "source": [ "print(\"The shape of the codebook is: \", pq_encoder.codewords.shape)\n", "print(\"Is the encoder trained? \", pq_encoder.encoder_is_trained)\n", @@ -205,32 +179,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "763f58a5", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating PQ-codes: 100%|██████████| 6/6 [00:03<00:00, 1.50it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape: (10000000, 6), MB: 60.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "pq_codes = pq_encoder.transform(chunk)\n", "print(f\"Shape: {pq_codes.shape}, MB: {pq_codes.nbytes/1e6}\")\n" @@ -247,37 +199,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "c8af696e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[23 0 0 0 0 0 0 1 4 1 0 29 13 1 0 9 8 0 8 7 6 1 1 0\n", - " 0 5 4 3 0 11 6 0 0 0 1 2 0 0 0 0 0 0] (1000000, 42)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 15.82it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3 140 163 49 11 1]\n", - "[23 0 0 0 0 0 0 1 4 1 0 29 13 0 0 8 8 0 8 8 5 1 1 0\n", - " 0 4 3 3 0 11 6 0 0 0 1 2 0 0 0 0 0 0] (1000000, 42)\n", - "Reconversion process:\n", - "Reconstruction fidelity: 92.02%\n" - ] - } - ], + "outputs": [], "source": [ "# Load a different chunk\n", "mqn_chunk_2 = np.load('/mnt/samsung_2tb/tmp/enamine-chunk_00001.npy')\n", @@ -368,29 +293,7 @@ "execution_count": null, "id": "1f2e9277", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([49161, 13845, 59452, 70857, 92866, 53390, 86229, 76216, 10800,\n", - " 35907, 28614, 36577, 37133, 7373, 78079, 86816, 32896, 54950,\n", - " 50991, 61871, 41623, 62782, 13150, 18079, 17881, 90621, 16919,\n", - " 89129, 41955, 51549, 28733, 6753, 5147, 10976, 94913, 24994,\n", - " 41253, 39846, 23494, 39025, 92625, 32737, 32737, 92866, 10623,\n", - " 66455, 41056, 32782, 60229, 86723, 70828, 25645, 19690, 66414,\n", - " 57948, 62575, 37133, 79107, 80172, 25645, 42460, 87881, 64548,\n", - " 84159, 22125, 75078, 86370, 85735, 45628, 55806, 71367, 31446,\n", - " 7686, 2500, 65864, 95833, 19290, 87959, 84159, 86816, 90085,\n", - " 60947, 17623, 66380, 68923, 12279, 32309, 55541, 30332, 76162,\n", - " 94913, 18079, 79672, 24077, 50207, 88405, 33924, 89928, 25457,\n", - " 92866])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "kmeans.predict(pq_codes[:100])\n" ] @@ -400,35 +303,7 @@ "execution_count": null, "id": "00100a94", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.59it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.19it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.64it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.00it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.99it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.34it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.17it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.37it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.33it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.97it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.26it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.07it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.15it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.54it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.23it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.58it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.01it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 13.85it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.38it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.25it/s]\n", - "Generating PQ-codes: 100%|██████████| 6/6 [00:00<00:00, 14.50it/s]\n" - ] - } - ], + "outputs": [], "source": [ "\n", "# Cluster the file with the SMILES strings\n", @@ -451,11 +326,126 @@ " df.to_parquet(os.path.join(output_path, f'clustered_chunk_{i}.parquet'), index=False)\n", " \n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d670042", + "metadata": {}, + "outputs": [], + "source": [ + "# Cluster the file with the SMILES strings\n", + "from chelombus import PQEncoder, DataStreamer, FingerprintCalculator\n", + "from chelombus.clustering.PyQKmeans import PQKMeans\n", + "import pyarrow as pa, pyarrow.parquet as pq\n", + "from chelombus.utils import format_time\n", + "encoder = PQEncoder.load('../models/paper_encoder.joblib')\n", + "clusterer = PQKMeans.load('../models/paper_clusterer.joblib')\n", + "stream = DataStreamer()\n", + "fp_calc = FingerprintCalculator()\n", + "import timeit\n", + "\n", + "for i, chunk in enumerate(stream.parse_input(\n", + " '/mnt/10tb_hdd/cleaned_enamine_10b/output_file_0.cxsmiles',\n", + " chunksize=1_000_000, smiles_col=1, verbose=0,\n", + ")):\n", + " fps = fp_calc.FingerprintFromSmiles(chunk, 'mqn')\n", + " t = timeit.timeit()\n", + " pq_codes = encoder.transform(fps, verbose=0, device='gpu') \n", + " print(\"GPU transform: \", timeit.timeit() - t)\n", + " t = timeit.timeit()\n", + " pq_codes = encoder.transform(fps, verbose=0, device='cpu') \n", + " print(\"CPU transform: \", format_time(timeit.timeit() - t))\n", + " t = timeit.timeit()\n", + " labels = clusterer.predict(pq_codes, device='cpu') \n", + " print(\"CPU predict: \", format_time(timeit.timeit() - t))\n", + " t = timeit.timeit()\n", + " labels = clusterer.predict(pq_codes, device='gpu') \n", + " print(\"GPU predict: \", format_time(timeit.timeit() - t))\n", + " # table = pa.table({\"smiles\": chunk, \"cluster_id\": labels})\n", + " # pq.write_table(table, f'/mnt/samsung_2tb/tmp/chunk_{i:05d}.parquet')\n" + ] + }, + { + "cell_type": "markdown", + "id": "fa294f09", + "metadata": {}, + "source": [ + "# Visualization with TMAP\n", + "\n", + "After clustering, we can create interactive TMAPs to explore the chemical space. The simplest approach: sample representatives from each cluster, compute MQN fingerprints, and let `TMAP` handle the layout." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55a7899a", + "metadata": {}, + "outputs": [], + "source": [ + "from chelombus import sample_from_cluster\n", + "from chelombus.utils.visualization import representative_tmap\n", + "# Sample representatives from several clusters\n", + "results_dir = '/mnt/2tb/tmp/' # Path to your clustered parquet files\n", + "rep_smiles = []\n", + "rep_cids = []\n", + "\n", + "for cid in range(100): # First 100 clusters\n", + " df = sample_from_cluster(results_dir, cluster_id=cid, n=1)\n", + " if len(df) > 0:\n", + " rep_smiles.extend(df['smiles'].tolist())\n", + " rep_cids.extend([cid] * len(df))\n", + "\n", + "print(f\"Collected {len(rep_smiles)} representatives from {len(set(rep_cids))} clusters\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adf5d309", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the representative TMAP, MQN + euclidean is the natural choice for representatives\n", + "representative_tmap(\n", + " smiles=rep_smiles,\n", + " cluster_ids=rep_cids,\n", + " fingerprint='mqn',\n", + " properties=['mw', 'logp', 'qed', 'n_rings', 'hba', 'hbd'],\n", + " tmap_name='enamine_representative',\n", + " k_neighbors=20,\n", + ")\n", + "\n", + "print(\"Representative TMAP generated: enamine_representative.html\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "269e44c0", + "metadata": {}, + "source": [ + "You can also build a TMAP directly using tmap2 if you need more control:\n", + "\n", + "```python\n", + "from tmap import TMAP\n", + "from tmap.utils import fingerprints_from_smiles, molecular_properties\n", + "\n", + "fps = fingerprints_from_smiles(rep_smiles, fp_type='mqn')\n", + "model = TMAP(metric='euclidean', n_neighbors=20).fit(fps)\n", + "\n", + "viz = model.to_tmapviz(include_edges=True)\n", + "props = molecular_properties(rep_smiles, properties=['mw', 'logp', 'qed'])\n", + "for name, values in props.items():\n", + " viz.add_color_layout(name, values, categorical=False, color='rainbow')\n", + "viz.add_smiles(rep_smiles)\n", + "viz.write_html('custom_tmap.html')\n", + "```" + ] } ], "metadata": { "kernelspec": { - "display_name": "spiq", + "display_name": "venv (3.12.6)", "language": "python", "name": "python3" }, @@ -469,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb index d291536..a1e7b58 100644 --- a/examples/tutorial.ipynb +++ b/examples/tutorial.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,18 +33,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fingerprints shape: (4, 1024)\n", - "Fingerprint 1 [0 0 1 ... 0 0 0]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Define a list of SMILES strings\n", "smiles_list = [\"CCO\", \"C1CCCCC1\", \"O=C=O\", \"O=C=O\"]\n", @@ -72,18 +63,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " Fingerprints calculated: 100,000" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Import iterator method\n", "ds = DataStreamer()\n", @@ -108,17 +90,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Fingerprints calculated: 100,000" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from chelombus.utils.helper_functions import save_chunk\n", "\n", @@ -141,17 +115,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(100000, 1024) 102.4 MB\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# At this step we could load Fingerprints computed in the previous step -likely what you'd do for large datasets\n", "# or in case it fits in memory just pass the fingerprint matrix\n", @@ -177,17 +143,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Training PQ-encoder: 100%|██████████| 4/4 [00:12<00:00, 3.12s/it]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from chelombus.encoder.encoder import PQEncoder\n", "\n", @@ -210,19 +168,9 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The shape of the codebook is: (4, 256, 256)\n", - "Is the encoder trained? True\n", - "The lables: KMeans(max_iter=10, n_clusters=256) have length: 100000\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"The shape of the codebook is: \", pq_encoder.codewords.shape)\n", "print(\"Is the encoder trained? \", pq_encoder.encoder_is_trained)\n", @@ -239,38 +187,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10,000 fingerprints of 1024 dimensions to be transformed into PQ-codes\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generating PQ-codes: 100%|██████████| 4/4 [00:00<00:00, 85.00it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Transforming 10,000 fingeprints took 0.05 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np \n", "import time\n", @@ -294,43 +213,30 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[137, 84, 139, 81],\n", - " [137, 36, 39, 81],\n", - " [137, 84, 139, 53],\n", - " ...,\n", - " [250, 44, 200, 16],\n", - " [250, 44, 200, 238],\n", - " [144, 253, 218, 166]], shape=(10000, 4), dtype=uint8)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from chelombus import PQEncoder\n", + "\n", + "encoder = PQEncoder.load('/home/afloresep/work/Chelombus/models/paper_encoder.joblib')\n", + "encoder.transform(np.random.randint(0, 200, (10_000_000, 42)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "X_pq_code\n" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of unique 4-dim vectors: 6116\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "# Count unique rows\n", @@ -348,18 +254,9 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original input of shape: (10000, 1024) and size of 10,240,000 bytes is now transformed into shape (10000, 4) and size of 40,000 bytes\n", - "This is 256 times more memory efficient\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(f\"Original input of shape: {X_test.shape} and size of {X_test.nbytes:,} bytes is now transformed into shape {X_pq_code.shape} and size of {X_pq_code.nbytes:,} bytes\")\n", "print(f\"This is {int(X_test.nbytes / X_pq_code.nbytes)} times more memory efficient\")\n" @@ -376,19 +273,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Assigned 10000 molecules to 100 clusters\n", - "Labels shape: (10000,)\n", - "Sample labels: [83 76 35 76 60 94 94 94 76 94]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from chelombus import PQKMeans\n", "\n", @@ -414,22 +301,9 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cluster centers shape: (100, 4)\n", - "\n", - "Cluster size statistics:\n", - " Min cluster size: 4\n", - " Max cluster size: 346\n", - " Mean cluster size: 100.0\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Cluster centers are stored as PQ codes\n", "print(f\"Cluster centers shape: {clusterer.cluster_centers_.shape}\")\n", @@ -451,17 +325,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Predicted labels: [47 27 20 47 20 27 33 27 47 33]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Fit on training data\n", "clusterer2 = PQKMeans(encoder=pq_encoder, k=50, iteration=15)\n", @@ -483,17 +349,9 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded model predictions match: True\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Save the trained clusterer\n", "clusterer2.save(\"data/pqkmeans_model.joblib\")\n", @@ -517,17 +375,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created 2 parquet chunks\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# First, let's create sample parquet files for demonstration\n", "import os\n", @@ -551,31 +401,14 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of clusters: 100\n", - "Total molecules: 10,000\n", - "\n", - "Top 5 largest clusters:\n", - " cluster_id molecule_count\n", - "76 76 346\n", - "93 93 285\n", - "58 58 273\n", - "73 73 239\n", - "60 60 234\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from chelombus import get_cluster_stats, query_cluster, sample_from_cluster\n", "\n", "# Get statistics for all clusters\n", - "stats = get_cluster_stats(\"data/results/\")\n", + "stats = get_cluster_stats(\"/mnt/10tb_hdd/enamine_clustered_october_2025/build_clickhouse_db_enamine_files\", \"2B_cluster_id\")\n", "print(f\"Number of clusters: {len(stats)}\")\n", "print(f\"Total molecules: {stats['molecule_count'].sum():,}\")\n", "print(f\"\\nTop 5 largest clusters:\")\n", @@ -584,23 +417,9 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cluster 0 contains 51 molecules\n", - " smiles cluster_id\n", - "0 CC1=CC=CC=C1N1N=NN=C1SCC(=O)N1CCN(C2=CC=C(F)C=... 0\n", - "1 [N-]=[N+]=NCCCC(=O)CSC1=NN=C(C2=CC=C(F)C=C2)N1... 0\n", - "2 O=C(O)C1=CN=C(SCC(=O)N2CCSC3=CC=CC=C32)N1C1=CC... 0\n", - "3 CC(C1=NN=C(SCCC2=CC=NC=C2)N1C1=CC=C(F)C=C1)N1C... 0\n", - "4 CC(C1=NN=C(SCC2CC(=O)N(C)C2)N1C1=CC=C(F)C=C1)N... 0\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Query all molecules from a specific cluster\n", "cluster_id = stats.iloc[0]['cluster_id'] # Get largest cluster\n", @@ -611,18 +430,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Random sample of 5 molecules from cluster 0:\n", - "['CCC(C1=NN=C(SCC(=O)[C@]2(C)C[C@H]2CC)N1C1=CC=C(OC)C=C1)N(C)C', 'CC(C1=NN=C(SCCC2=CC=NC=C2)N1C1=CC=C(F)C=C1)N1CCCCC1', 'CC(C)OC(=O)CCSC1=NN=C(C(C)N2CCCCC2)N1C1=CC=C(Cl)C=C1', 'NC1=NC(Cl)=CC(CSC2=NN=C(CN3CCCCC3)N2C2=CC=C(F)C=C2)=N1', 'CC1=CC(C(=O)CSC2=NN=CN2C2=CC=CC=C2)=C(C)N1C1=CC=C(F)C=C1']\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Get a random sample from a cluster (useful for previewing)\n", "sample = sample_from_cluster(\"data/results/\", cluster_id=int(cluster_id), n=5, random_state=42)\n", @@ -632,17 +442,9 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exported 51 molecules to data/cluster_export.parquet\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from chelombus import export_cluster\n", "\n", @@ -658,144 +460,21 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "### Visualization with TMAP\n", - "\n", - "Create interactive tree maps (TMAPs) to visualize molecular clusters. TMAPs use LSH Forest to find nearest neighbors and create a tree-like layout." - ] + "source": "### Visualization with TMAP\n\nCreate interactive tree maps (TMAPs) to visualize molecular clusters. Chelombus uses [tmap2](https://github.com/afloresep/tmap2) under the hood, just pass SMILES and it handles fingerprints, layout, and rendering." }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/afloresep/miniforge3/envs/tmapnew/lib/python3.12/site-packages/faerun/faerun.py:409: RuntimeWarning: invalid value encountered in divide\n", - " data_c[s] = (data_c[s] - min_c[s]) / (max_c[s] - min_c[s])\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "./my_tmap_TMAP.html
" - ], - "text/plain": [ - "/home/afloresep/work/Chelombus/examples/my_tmap_TMAP.html" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TMAP generated: sample_tmap_TMAP.html\n" - ] - } - ], - "source": [ - "from chelombus.utils.visualization import create_tmap\n", - "# Create a TMAP for a subset of molecules (for demonstration)\n", - "sample_smiles = smiles[:1000]\n", - "\n", - "# This will generate an interactive HTML file\n", - "create_tmap(\n", - " smiles=sample_smiles, \n", - " fingerprint = 'morgan', # or 'mqn'\n", - " fingerprint_bits= 1024, # for 'morgan'\n", - " fingerprint_radius = 2, # for 'morgan'\n", - " # properties= [\"MW\", \"HAC\"], # default is all properties included\n", - " tmap_name= 'my_tmap',\n", - " k_neighbors= 20,\n", - " )\n", - "\n", - "print(\"TMAP generated: sample_tmap_TMAP.html\")\n" - ] + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "from chelombus.utils.visualization import create_tmap\n\n# Create a TMAP for a subset of molecules\nsample_smiles = smiles[:1000]\n\n# This will generate an interactive HTML file\ncreate_tmap(\n smiles=sample_smiles,\n fingerprint='morgan',\n properties=['mw', 'logp', 'qed', 'n_rings'],\n tmap_name='my_tmap',\n k_neighbors=20,\n)\n\nprint(\"TMAP generated: my_tmap.html\")" }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/afloresep/miniforge3/envs/tmapnew/lib/python3.12/site-packages/faerun/faerun.py:409: RuntimeWarning: invalid value encountered in divide\n", - " data_c[s] = (data_c[s] - min_c[s]) / (max_c[s] - min_c[s])\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "./my_tmap_TMAP.html
" - ], - "text/plain": [ - "/home/afloresep/work/Chelombus/examples/my_tmap_TMAP.html" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# representative_tmap for the first tmap\n", - "import numpy as np\n", - "from chelombus.utils.visualization import representative_tmap\n", - "representative_tmap(\n", - " smiles=smiles[:2000], \n", - " cluster_ids = list(np.random.randint(0, 100000, 2000)), # cluster_ids, this generates the label visible when clicking on the node\n", - " tmap_name= 'my_tmap',\n", - " k_neighbors= 20,\n", - " #properties = [\"MW\"], default is all properties included\n", - " )\n" - ] + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Representative TMAP with cluster IDs\nimport numpy as np\nfrom chelombus.utils.visualization import representative_tmap\n\nrepresentative_tmap(\n smiles=smiles[:2000],\n cluster_ids=list(np.random.randint(0, 100000, 2000)),\n fingerprint='mqn',\n tmap_name='representative_tmap',\n k_neighbors=20,\n)\n\nprint(\"Representative TMAP generated: representative_tmap.html\")" }, { "cell_type": "markdown", @@ -832,7 +511,7 @@ ], "metadata": { "kernelspec": { - "display_name": "tmapnew", + "display_name": "venv (3.12.6)", "language": "python", "name": "python3" }, @@ -846,9 +525,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7aa2ca5..7dd84a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,20 +4,19 @@ build-backend = "setuptools.build_meta" [project] name = "chelombus" -version = "0.2.1" +version = "0.2.2" description = "Billion-scale molecular clustering and visualization using Product Quantization and nested TMAPs." readme = "README.md" license = {text = "MIT"} authors = [ { name = "Alejandro Flores", email = "afloresep01@gmail.com" } ] -requires-python = ">=3.9" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Intended Audience :: Science/Research", @@ -30,9 +29,7 @@ dependencies = [ "rdkit>=2022.03", "tqdm>=4.60.0", "scikit-learn>=1.0.0", - "tmap-silicon>=1.0.19", - "pandarallel>=1.6.0", - "faerun>=0.4.0", + "tmap2>=0.2.0", "pyarrow>=10.0.0", "duckdb>=0.9.0", "pqkmeans", diff --git a/requirements.txt b/requirements.txt index 5a15640..e8d61c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,9 +13,8 @@ numba>=0.57.0 pyarrow>=10.0.0 duckdb>=0.9.0 -# Visualization (optional) -# tmap>=1.0.0 -# faerun>=0.4.0 +# Visualization +tmap2>=0.2.0 # Development pytest>=7.0.0