diff --git a/experiments/ethereum-repo-clusters/.gitignore b/experiments/ethereum-repo-clusters/.gitignore new file mode 100644 index 00000000..58ae5f3c --- /dev/null +++ b/experiments/ethereum-repo-clusters/.gitignore @@ -0,0 +1,38 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environment +.env +.venv +env/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Project specific +data/ +output/ \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/CategorySummary.ipynb b/experiments/ethereum-repo-clusters/CategorySummary.ipynb new file mode 100644 index 00000000..9ca79d6b --- /dev/null +++ b/experiments/ethereum-repo-clusters/CategorySummary.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9c0861ae-d89b-4f21-a743-f5a77efa7648", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "add99052-fecf-4130-9cc5-b7413c643864", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
repo_artifact_idproject_idproject_namedisplay_namerepo_artifact_namespacerepo_artifact_namecreated_atupdated_atstar_countfork_count...is_actively_maintainedfinal_recommendationprocessing_timestampsummaryreadme_statusprotocol_architectecosystem_analystsecurity_researcheruser_experience_advocategovernance_specialist
0jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM=KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU=ethereum-attestation-serviceEthereum Attestation Serviceethereum-attestation-serviceeas-docs-site2022-11-09 19:39:56.000 UTC2025-06-02 15:51:08.000 UTC1739...TrueDeveloper Experience Tools2025-06-06T00:55:35.737447The project provides documentation for the Eth...SUCCESSDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsApplication-Specific & Niche Tools
1Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechgmp-wasm2020-12-16 08:27:02.000 UTC2025-03-24 16:40:59.000 UTC174...TrueCryptography & Primitives2025-06-06T00:55:27.645719The GNU MP Library provides arbitrary precisio...SUCCESSCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & Primitives
2sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechdiagnostics2023-02-22 11:05:42.000 UTC2025-04-25 07:42:52.000 UTC1721...TrueInfrastructure & Node Operations2025-06-06T00:55:20.010674The Erigon Diagnostics System is a web applica...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsDeFi Security & MonitoringInfrastructure & Node OperationsInfrastructure & Node Operations
39C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainscourt2018-05-02 19:41:02.000 UTC2025-05-20 03:41:25.000 UTC177...TrueApplication-Specific & Niche Tools2025-06-06T00:55:11.604116Court provides smart contracts for arbitrating...SUCCESSApplication-Specific & Niche ToolsApplication-Specific & Niche ToolsDeFi Security & MonitoringApplication-Specific & Niche ToolsGovernance & DAO Tooling
4j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainsop-resolver2022-11-03 11:14:36.000 UTC2025-05-20 03:21:33.000 UTC176...TrueInteroperability & Cross-chain2025-06-06T00:55:03.917944The Optimism Resolver project facilitates stor...SUCCESSInfrastructure & Node OperationsInteroperability & Cross-chainInteroperability & Cross-chainUser Interface & Integration SDKsGovernance & DAO Tooling
..................................................................
5234AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuels-rs2021-10-31 22:33:54.000 UTC2025-06-03 17:34:29.000 UTC437471355...TrueDevelopment Frameworks2025-06-05T14:24:14.479181The fuels-rs project provides a Rust SDK for t...SUCCESSCore Protocol InterfacesDevelopment FrameworksDevelopment FrameworksDevelopment FrameworksDevelopment Frameworks
5235JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w=vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g=go-ethereumgethethereumgo-ethereum2013-12-26 13:05:46.000 UTC2025-06-03 16:54:54.000 UTC4906520888...TrueInfrastructure & Node Operations2025-06-05T14:24:08.096520Go Ethereum (geth) is a Golang implementation ...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5236imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuel-core2020-08-27 21:12:14.000 UTC2025-06-03 17:34:30.000 UTC576372852...TrueInfrastructure & Node Operations2025-06-05T14:24:01.176979The Fuel client implements a Fuel node, provid...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5237XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabssway2021-01-19 20:54:33.000 UTC2025-06-03 17:34:31.000 UTC622555405...TrueLanguage & Compilation Tools2025-06-05T14:23:54.181337Sway is a programming language designed for th...SUCCESSLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation Tools
5238ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA=cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o=base-orgBasebase-orgnode2023-02-01 13:55:02.000 UTC2025-02-10 01:22:12.000 UTC685682635...TrueInfrastructure & Node Operations2025-06-05T14:23:47.813647The Base Node project provides Docker configur...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
\n", + "

5239 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " repo_artifact_id \\\n", + "0 jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM= \n", + "1 Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w= \n", + "2 sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc= \n", + "3 9C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY= \n", + "4 j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM= \n", + "... ... \n", + "5234 AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4= \n", + "5235 JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w= \n", + "5236 imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA= \n", + "5237 XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4= \n", + "5238 ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA= \n", + "\n", + " project_id \\\n", + "0 KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU= \n", + "1 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "2 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "3 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "4 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "... ... \n", + "5234 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5235 vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g= \n", + "5236 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5237 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5238 cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o= \n", + "\n", + " project_name display_name \\\n", + "0 ethereum-attestation-service Ethereum Attestation Service \n", + "1 erigontech erigontech \n", + "2 erigontech erigontech \n", + "3 ensdomains ENS \n", + "4 ensdomains ENS \n", + "... ... ... \n", + "5234 fuellabs Fuel Network \n", + "5235 go-ethereum geth \n", + "5236 fuellabs Fuel Network \n", + "5237 fuellabs Fuel Network \n", + "5238 base-org Base \n", + "\n", + " repo_artifact_namespace repo_artifact_name \\\n", + "0 ethereum-attestation-service eas-docs-site \n", + "1 erigontech gmp-wasm \n", + "2 erigontech diagnostics \n", + "3 ensdomains court \n", + "4 ensdomains op-resolver \n", + "... ... ... \n", + "5234 fuellabs fuels-rs \n", + "5235 ethereum go-ethereum \n", + "5236 fuellabs fuel-core \n", + "5237 fuellabs sway \n", + "5238 base-org node \n", + "\n", + " created_at updated_at star_count \\\n", + "0 2022-11-09 19:39:56.000 UTC 2025-06-02 15:51:08.000 UTC 17 \n", + "1 2020-12-16 08:27:02.000 UTC 2025-03-24 16:40:59.000 UTC 17 \n", + "2 2023-02-22 11:05:42.000 UTC 2025-04-25 07:42:52.000 UTC 17 \n", + "3 2018-05-02 19:41:02.000 UTC 2025-05-20 03:41:25.000 UTC 17 \n", + "4 2022-11-03 11:14:36.000 UTC 2025-05-20 03:21:33.000 UTC 17 \n", + "... ... ... ... \n", + "5234 2021-10-31 22:33:54.000 UTC 2025-06-03 17:34:29.000 UTC 43747 \n", + "5235 2013-12-26 13:05:46.000 UTC 2025-06-03 16:54:54.000 UTC 49065 \n", + "5236 2020-08-27 21:12:14.000 UTC 2025-06-03 17:34:30.000 UTC 57637 \n", + "5237 2021-01-19 20:54:33.000 UTC 2025-06-03 17:34:31.000 UTC 62255 \n", + "5238 2023-02-01 13:55:02.000 UTC 2025-02-10 01:22:12.000 UTC 68568 \n", + "\n", + " fork_count ... is_actively_maintained \\\n", + "0 39 ... True \n", + "1 4 ... True \n", + "2 21 ... True \n", + "3 7 ... True \n", + "4 6 ... True \n", + "... ... ... ... \n", + "5234 1355 ... True \n", + "5235 20888 ... True \n", + "5236 2852 ... True \n", + "5237 5405 ... True \n", + "5238 2635 ... 
True \n", + "\n", + " final_recommendation processing_timestamp \\\n", + "0 Developer Experience Tools 2025-06-06T00:55:35.737447 \n", + "1 Cryptography & Primitives 2025-06-06T00:55:27.645719 \n", + "2 Infrastructure & Node Operations 2025-06-06T00:55:20.010674 \n", + "3 Application-Specific & Niche Tools 2025-06-06T00:55:11.604116 \n", + "4 Interoperability & Cross-chain 2025-06-06T00:55:03.917944 \n", + "... ... ... \n", + "5234 Development Frameworks 2025-06-05T14:24:14.479181 \n", + "5235 Infrastructure & Node Operations 2025-06-05T14:24:08.096520 \n", + "5236 Infrastructure & Node Operations 2025-06-05T14:24:01.176979 \n", + "5237 Language & Compilation Tools 2025-06-05T14:23:54.181337 \n", + "5238 Infrastructure & Node Operations 2025-06-05T14:23:47.813647 \n", + "\n", + " summary readme_status \\\n", + "0 The project provides documentation for the Eth... SUCCESS \n", + "1 The GNU MP Library provides arbitrary precisio... SUCCESS \n", + "2 The Erigon Diagnostics System is a web applica... SUCCESS \n", + "3 Court provides smart contracts for arbitrating... SUCCESS \n", + "4 The Optimism Resolver project facilitates stor... SUCCESS \n", + "... ... ... \n", + "5234 The fuels-rs project provides a Rust SDK for t... SUCCESS \n", + "5235 Go Ethereum (geth) is a Golang implementation ... SUCCESS \n", + "5236 The Fuel client implements a Fuel node, provid... SUCCESS \n", + "5237 Sway is a programming language designed for th... SUCCESS \n", + "5238 The Base Node project provides Docker configur... SUCCESS \n", + "\n", + " protocol_architect ecosystem_analyst \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "3 Application-Specific & Niche Tools Application-Specific & Niche Tools \n", + "4 Infrastructure & Node Operations Interoperability & Cross-chain \n", + "... ... ... \n", + "5234 Core Protocol Interfaces Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " security_researcher user_experience_advocate \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 DeFi Security & Monitoring Infrastructure & Node Operations \n", + "3 DeFi Security & Monitoring Application-Specific & Niche Tools \n", + "4 Interoperability & Cross-chain User Interface & Integration SDKs \n", + "... ... ... \n", + "5234 Development Frameworks Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " governance_specialist \n", + "0 Application-Specific & Niche Tools \n", + "1 Cryptography & Primitives \n", + "2 Infrastructure & Node Operations \n", + "3 Governance & DAO Tooling \n", + "4 Governance & DAO Tooling \n", + "... ... 
\n", + "5234 Development Frameworks \n", + "5235 Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations \n", + "\n", + "[5239 rows x 22 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet('output/ethereum_repos_unified.parquet')\n", + "df[\"categorizations_list\"] = df[\"categorizations_json\"].apply(json.loads)\n", + "\n", + "def persona_to_category_map(cats_list):\n", + " return { d[\"persona_name\"]: d[\"category\"] for d in cats_list }\n", + "df_persona_map = df[\"categorizations_list\"].apply(persona_to_category_map)\n", + "df_persona_cols = pd.json_normalize(df_persona_map)\n", + "\n", + "df = df.join(df_persona_cols)\n", + "df = df.drop(columns=[\"categorizations_list\", \"categorizations_json\", \"readme_md\"])\n", + "#df.to_csv('categorizations.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "368d46ba-cedc-455d-a246-925b6c996090", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/ethereum-repo-clusters/README.md b/experiments/ethereum-repo-clusters/README.md new file mode 100644 index 00000000..5f5db141 --- /dev/null +++ b/experiments/ethereum-repo-clusters/README.md @@ -0,0 +1,402 @@ +# Ethereum Repo Clusters + +A Python package for automatically clustering Ethereum development tools and libraries based on their README content using AI-driven analysis and multiple personas. + +## Overview + +This project implements a pipeline to: +1. Fetch repository data from the OSO (Open Source Observer) database. +2. Retrieve corresponding README files from GitHub. +3. Generate concise project summaries using Google's Gemini AI. +4. Employ multiple configurable AI personas to categorize each project based on its summary and metadata. +5. Consolidate these categorizations, using a star-count weighted approach for projects with multiple repositories, to produce a final recommended category. + +The entire process is managed via a Command Line Interface (CLI). + +## Features + +- Fetches comprehensive repository data via OSO, including fork status and activity tracking. +- Retrieves and processes README.md files from GitHub with robust error handling. +- Utilizes Google's Gemini AI for intelligent summary generation. +- Employs a multi-persona approach for nuanced project categorization. +- Supports an arbitrary number of configurable AI personas. +- Calculates final project recommendations using star-count weighted consolidation. +- Offers both modular pipeline and unified processing approaches. +- Provides detailed tracking of repository status (active/inactive, fork/non-fork). +- Handles empty or error READMEs gracefully with "UNCATEGORIZED" status. +- Includes timestamps for all categorization operations. +- Test mode for quick runs on a subset of data. +- Outputs data at various stages in Parquet and CSV formats (with README text removed from CSV for readability). 
+- Supports easy resumption of processing and addition of new repositories. +- Features comprehensive progress bars at multiple levels for better visibility into processing status. +- **Checkpoint System**: Automatically saves progress after each step, allowing for seamless recovery from interruptions. +- **Incremental Saving**: Saves results after processing each repository, ensuring no work is lost if the process is interrupted. +- **Resume Capability**: Automatically detects partially processed repositories and continues from where it left off. + +## Prerequisites + +- Python 3.10+ +- Access to OSO, GitHub, and Google Gemini APIs. + +## Installation + +1. **Clone the repository:** + ```bash + git clone + cd ethereum-repo-clusters + ``` + +2. **Set up a virtual environment (recommended):** + ```bash + python -m venv venv + source venv/bin/activate # On Windows use `venv\Scripts\activate` + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + +4. **Install the package in editable mode (optional, for development):** + ```bash + pip install -e . + ``` + +5. **Create a `.env` file** in the project root directory (`ethereum-repo-clusters/`) and add your API keys: + ```env + OSO_API_KEY="your_oso_api_key" + GITHUB_TOKEN="your_github_token" # A GitHub Personal Access Token with repo access + GEMINI_API_KEY="your_gemini_api_key" + ``` + These keys are loaded via `ethereum-repo-clusters/config/settings.py`. + +## Configuration + +The project uses a combination of a JSON configuration file and Python modules for settings: + +- **`pipeline_config.json`**: + - Located at the project root. + - Controls operational settings like `output_dir`, `test_mode`, `test_mode_limit`, AI model name (`gemini_model`), and batch sizes for AI processing. + - If this file is missing, it will be automatically created with default values on the first run. + - Values in this file override defaults sourced from Python modules. + +- **AI Personas (`ethereum-repo-clusters/config/prompts/personas.py`):** + - Define the different AI personas used for categorization. + - Each persona is a dictionary with `name`, `title`, `description`, and a `prompt` template. + - Modify this Python list directly to add, remove, or change personas. + +- **Categories (`ethereum-repo-clusters/config/prompts/categories.py`):** + - Defines the list of possible categories projects can be assigned to. + - Includes `CATEGORIES` (list of dicts with `category` and `description`) and `CATEGORY_NAMES` (a simple list of category names). + - Edit this file to update the categorization taxonomy. + +- **Prompt Templates (`ethereum-repo-clusters/config/prompts/summary_prompts.py`):** + - Contains `SUMMARY_PROMPT` (for generating project summaries) and `TAGS_PROMPT` (for an auxiliary tag generation, currently not central to categorization). + - These are used by the `AIService`. + +- **Core Settings (`ethereum-repo-clusters/config/settings.py`):** + - Loads API keys from the `.env` file. + - Defines default values for `GEMINI_MODEL` and `OUTPUT_DIR` if not specified in `pipeline_config.json`. + +## Usage (CLI) + +The project is operated via the command line using `python -m ethereum-repo-clusters`. + +**General Command Structure:** +```bash +python -m ethereum-repo-clusters [GLOBAL_OPTIONS] COMMAND [COMMAND_OPTIONS] +``` + +**Global Options:** +- `--test-mode`: Runs the specified command(s) in test mode, processing a limited number of repositories (defined by `test_mode_limit` in `pipeline_config.json`, sorted by stars). 
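To illustrate what the test-mode limit does, here is a minimal sketch — not the project's actual implementation — of applying `test_mode` and `test_mode_limit` to a repository DataFrame. The `is_test_mode()` and `get_test_mode_limit()` accessors are the real `ConfigManager` methods shown later in this diff; the `apply_test_mode_limit` helper, the DataFrame shape, and the reading of "sorted by stars" as "highest star count first" are assumptions.

```python
import pandas as pd

def apply_test_mode_limit(repos: pd.DataFrame, test_mode: bool, limit: int) -> pd.DataFrame:
    """Hypothetical helper: in test mode, keep only the top-`limit` repos by stars."""
    if not test_mode:
        return repos
    # "Sorted by stars" is read here as "highest star_count first".
    return repos.sort_values("star_count", ascending=False).head(limit)

# Usage sketch with the ConfigManager accessors defined in config_manager.py:
# repos_df = apply_test_mode_limit(repos_df,
#                                  config_manager.is_test_mode(),
#                                  config_manager.get_test_mode_limit())
```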
+ +**Main Commands:** + +- **`fetch_repos`**: Fetches repository data from OSO and READMEs from GitHub. + ```bash + python -m ethereum-repo-clusters fetch_repos + ``` + - `--force-refresh`: Wipes existing raw repository data and re-fetches. + - `--fetch-new-only`: Only fetches repositories that don't exist in current data. + +- **`generate_summaries`**: Generates AI summaries for fetched repositories. + ```bash + python -m ethereum-repo-clusters generate_summaries + ``` + - `--force-refresh`: Wipes existing summaries and regenerates them. + - `--new-only`: Only generates summaries for repositories that don't have summaries yet. + +- **`categorize`**: Categorizes projects using all defined AI personas. + ```bash + python -m ethereum-repo-clusters categorize + ``` + - `--force-refresh`: Wipes existing categorizations and re-runs. + - `--persona `: Processes only the specified persona. Can be combined with `--force-refresh`. Example: + ```bash + python -m ethereum-repo-clusters categorize --persona keyword_spotter --force-refresh + ``` + - `--new-only`: Only categorizes repositories that don't have categories yet. + +- **`consolidate`**: Consolidates categorizations from all personas and generates final project recommendations. + ```bash + python -m ethereum-repo-clusters consolidate + ``` + *(This step does not typically require a force-refresh as it always processes the latest categorized data.)* + +**Persona Management (Informational):** +The CLI includes commands related to personas, but due to refactoring, persona definitions are now managed directly in `ethereum-repo-clusters/config/prompts/personas.py`. These CLI commands are informational: + +- `python -m ethereum-repo-clusters personas list`: Lists personas currently defined in `personas.py`. +- `python -m ethereum-repo-clusters personas add ...`: Provides instructions on how to add a persona by editing `personas.py`. +- `python -m ethereum-repo-clusters personas remove `: Provides instructions on how to remove a persona by editing `personas.py`. + +**Example Full Run in Test Mode with Full Refresh:** +```bash +# Legacy pipeline approach +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all + +# New unified processor approach (recommended) +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all --use-unified +``` + +## Workflow + +### Legacy Pipeline (Step-by-Step) + +1. **Fetch Data (`fetch_repos`):** + - Repository metadata is fetched from OSO. + - README.md content is fetched from GitHub for these repositories. + - Output: `output/devtooling_raw.parquet` + +2. **Generate Summaries (`generate_summaries`):** + - READMEs are processed by Gemini AI to create concise summaries. + - Output: `output/devtooling_summarized.parquet` + +3. **Categorize by Persona (`categorize`):** + - Each project summary (with metadata) is evaluated by every defined AI persona. + - Each persona assigns a category based on its specific prompt and the global category list. + - Output: Individual Parquet files per persona in `output/categorized/` (e.g., `output/categorized/keyword_spotter.parquet`). + +4. **Consolidate Recommendations (`consolidate`):** + - Categorizations from all personas are merged. + - For each project: + - If it's a single-repository project, the recommendation is based on a star-weighted aggregation of persona assignments for that repo. + - If it's a multi-repository project, the recommendation is determined by a star-count weighted aggregation of all persona assignments across all its repositories. 
The category with the highest total star weight wins. + - Output: `output/devtooling_full.parquet` and `output/devtooling_consolidated.csv`. + +### New Unified Processor (Recommended) + +The new unified processor combines all steps into a single efficient pipeline: + +1. **Process Repositories (`process_unified`):** + - Repository metadata is fetched from OSO, including fork status and activity tracking. + - README.md content is fetched from GitHub with robust error handling. + - For each repository with a valid README: + - A summary is generated immediately. + - All personas categorize the repository in sequence. + - Results are stored with timestamps for each operation. + - For repositories with empty or error READMEs: + - Status is tracked as "EMPTY" or "ERROR". + - All categorizations are marked as "UNCATEGORIZED". + - A final recommendation is determined based on the most common category across personas. + - Output: `output/ethereum_repos_unified.parquet` and `output/ethereum_repos_unified.csv`. + +The unified processor offers several advantages: +- Single pass through repositories (more efficient) +- Better error handling and status tracking +- Easier to resume processing or add new repositories +- Comprehensive data structure with all information in one place +- Timestamps for all operations for better traceability +- Detailed progress bars for tracking processing status at multiple levels +- CSV output with README text removed for improved readability +- Checkpoint system that saves progress after each step +- Incremental saving that preserves work even if interrupted +- Automatic resume capability that continues from where it left off + +## Output Files + +All output data is stored in the directory specified by `output_dir` in `pipeline_config.json` (default is `output/`). + +### Legacy Pipeline Output + +- **`devtooling_raw.parquet`**: Raw data fetched from OSO, augmented with GitHub README content. +- **`devtooling_summarized.parquet`**: Repositories with their AI-generated summaries. +- **`categorized/.parquet`**: Dataframe for each persona, containing the original summary data plus that persona's assigned category and reason. +- **`devtooling_full.parquet`**: The final consolidated dataset, with one row per project, including the overall recommendation, total stars, repo count, sample summary, and individual persona category modes. +- **`devtooling_consolidated.csv`**: A CSV version of the final consolidated data for easier viewing. + +### Unified Processor Output + +- **`ethereum_repos_unified.parquet`**: Comprehensive dataset containing all repositories with their metadata, summaries, and categorizations in a single structure. +- **`ethereum_repos_unified.csv`**: A CSV version of the unified data for easier viewing, with README text removed and long text fields truncated for readability. +- **`processing_checkpoint.json`**: Checkpoint file that tracks processing progress, allowing for seamless recovery from interruptions. 
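The `CategorySummary.ipynb` notebook added in this change shows how to work with the unified Parquet output; a condensed version of that pattern, loading the file and pivoting the per-persona categorizations into one column per persona, looks like this:

```python
import json
import pandas as pd

# Load the unified output written by the unified processor.
df = pd.read_parquet("output/ethereum_repos_unified.parquet")

# Each row stores the persona categorizations as a JSON string; expand them
# into one column per persona (protocol_architect, ecosystem_analyst, ...).
df["categorizations_list"] = df["categorizations_json"].apply(json.loads)
persona_cols = pd.json_normalize(
    df["categorizations_list"].apply(
        lambda cats: {c["persona_name"]: c["category"] for c in cats}
    )
)
df = df.join(persona_cols).drop(
    columns=["categorizations_list", "categorizations_json", "readme_md"]
)
```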
+ +### Unified Data Structure + +The unified processor creates a comprehensive data structure with the following key fields: + +```json +{ + "repo_artifact_id": "...", + "project_id": "...", + "repo_artifact_namespace": "...", + "repo_artifact_name": "...", + "is_fork": true/false, + "is_actively_maintained": true/false, + "last_updated": "2024-12-01", + "star_count": 100, + "readme_status": "SUCCESS/EMPTY/ERROR", + "summary": "...", + "categorizations": [ + { + "persona_name": "keyword_spotter", + "category": "Developer Tools", + "reason": "Contains keywords like 'CLI', 'build tool'...", + "timestamp": "2025-01-05T09:15:00Z" + }, + { + "persona_name": "senior_strategist", + "category": "Infrastructure", + "reason": "Mature project with strong adoption...", + "timestamp": "2025-01-05T09:15:01Z" + }, + { + "persona_name": "workflow_wizard", + "category": "Developer Tools", + "reason": "Streamlines development workflow...", + "timestamp": "2025-01-05T09:15:02Z" + } + ], + "final_recommendation": "Developer Tools", + "processing_timestamp": "2025-01-05T09:15:02Z" +} +``` + +This structure makes it easy to: +- Track which repositories have been processed +- Identify repositories with errors or empty READMEs +- See the categorization from each persona with timestamps +- Filter repositories by fork status or activity +- Resume processing from where you left off + +## Development Notes +- The project uses `tqdm` for progress bars during long operations, with detailed progress tracking at multiple levels: + - Overall batch processing + - Repository processing within each batch + - README fetching for each repository + - Categorization with each persona +- `DataManager` class in `ethereum-repo-clusters/pipeline/data_manager.py` handles all data persistence (reading/writing Parquet files). +- `AIService` in `ethereum-repo-clusters/processing/ai_service.py` abstracts interactions with the Gemini API. +- `UnifiedProcessor` in `ethereum-repo-clusters/pipeline/unified_processor.py` provides the new streamlined processing approach. +- The CLI in `ethereum-repo-clusters/cli/main_cli.py` supports both legacy and unified processing approaches. +- Output files are saved to the local `output/` directory in the current repository. + +## New CLI Commands + +### Unified Processing + +```bash +# Process repositories with the unified processor +python -m ethereum-repo-clusters process_unified [OPTIONS] + +# Options: +# --force-refresh Force refresh all data, ignoring existing. +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +# --limit INTEGER Limit the number of repositories to process. +``` + +### Run All with Unified Processor + +```bash +# Run the entire pipeline using the unified processor +python -m ethereum-repo-clusters run_all --use-unified [OPTIONS] + +# Additional options with --use-unified: +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +``` + +## Adding New Repositories + +To add new repositories to the analysis: + +1. The unified processor automatically detects which repositories have already been processed. +2. New repositories from OSO will be processed automatically on the next run. +3. To add repositories manually, you can: + - Update the OSO query in `fetcher.py` to include additional repositories. + - Create a custom script that adds repositories to the unified data structure. 
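As a small illustration of the filtering described in the unified data structure section above, the sketch below uses the documented field names (`readme_status`, `is_fork`, `is_actively_maintained`, `final_recommendation`) to pull out repositories that still need attention and to summarize final categories. The column names are taken from the example structure, so adjust if the actual schema differs.

```python
import pandas as pd

df = pd.read_parquet("output/ethereum_repos_unified.parquet")

# Repositories whose READMEs were empty or failed to fetch.
needs_review = df[df["readme_status"].isin(["EMPTY", "ERROR"])]
print(f"{len(needs_review)} repositories need a manual look")

# Category distribution for actively maintained, non-fork repositories.
active = df[df["is_actively_maintained"] & ~df["is_fork"]]
print(active["final_recommendation"].value_counts().head(10))
```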
+ +## Error Handling + +The unified processor handles errors gracefully: + +- Empty READMEs: Marked with `readme_status="EMPTY"` and categorized as "UNCATEGORIZED". +- Error fetching README: Marked with `readme_status="ERROR"` and categorized as "UNCATEGORIZED". +- API errors during categorization: The specific persona's categorization is marked as "UNCATEGORIZED" with the error reason. + +This approach ensures that all repositories are included in the final output, even if they couldn't be fully processed. + +## Checkpoint System + +The unified processor now includes a robust checkpoint system that makes it resilient to interruptions: + +### How It Works + +1. **Incremental Saving**: Results are saved after processing each repository, not just at the end. +2. **Checkpoint File**: A JSON file (`output/processing_checkpoint.json`) tracks: + - Which repositories have been fully processed + - Which repositories are partially processed and their current state + - The last repository that was successfully processed + +3. **Granular Progress Tracking**: The checkpoint tracks progress at multiple levels: + - README fetching status + - Summary generation status + - Which personas have completed categorization + +4. **Resume Logic**: When restarted after an interruption, the processor: + - Skips repositories that have been fully processed + - Continues from where it left off for partially processed repositories + - Preserves all work that was completed before the interruption + +5. **Space Optimization**: Once a repository is fully processed, its partial results are removed from the checkpoint file to save space. + +### Benefits + +- **No Lost Work**: Even if interrupted during a long-running process, no work is lost. +- **API Efficiency**: Avoids redundant API calls to GitHub and Gemini, saving rate limits and costs. +- **Time Savings**: Picks up exactly where it left off, avoiding redundant processing. +- **Resilience**: Handles network issues, API timeouts, and other temporary failures gracefully. + +### Example Checkpoint Structure + +```json +{ + "last_processed_repo_id": "ethereum/solidity", + "processed_repos": ["openzeppelin/openzeppelin-contracts", "ethereum/solidity"], + "partial_results": { + "ipfs/kubo": { + "readme_fetched": true, + "readme_status": "SUCCESS", + "summary_generated": true, + "personas_completed": ["protocol_architect", "ecosystem_analyst"], + "categorizations": [ + { + "persona_name": "protocol_architect", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:30.903574" + }, + { + "persona_name": "ecosystem_analyst", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:32.238039" + } + ] + } + } +} +``` + +This checkpoint system ensures that the processing pipeline is robust and can handle interruptions gracefully, making it suitable for processing large numbers of repositories over extended periods. 
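To make the resume behaviour concrete, here is a minimal sketch of the checkpoint pattern described above. It follows the example checkpoint structure, but the function names and the `process_repo` stand-in are illustrative only, not the actual `UnifiedProcessor` API.

```python
import json
from pathlib import Path

CHECKPOINT_PATH = Path("output/processing_checkpoint.json")

def load_checkpoint() -> dict:
    """Return the saved checkpoint, or a fresh one if none exists yet."""
    if CHECKPOINT_PATH.exists():
        return json.loads(CHECKPOINT_PATH.read_text())
    return {"last_processed_repo_id": None, "processed_repos": [], "partial_results": {}}

def save_checkpoint(checkpoint: dict) -> None:
    """Persist progress after every repository so an interruption loses no work."""
    CHECKPOINT_PATH.write_text(json.dumps(checkpoint, indent=2))

def is_done(repo_id: str, checkpoint: dict) -> bool:
    """Fully processed repositories are skipped on resume."""
    return repo_id in checkpoint["processed_repos"]

def process_repo(repo_id: str, partial: dict | None) -> None:
    """Stand-in for the real per-repo work (README fetch, summary, categorization)."""
    ...

def run(repo_ids: list[str]) -> None:
    checkpoint = load_checkpoint()
    for repo_id in repo_ids:
        if is_done(repo_id, checkpoint):
            continue
        # Resume from any partial results recorded for this repository.
        process_repo(repo_id, partial=checkpoint["partial_results"].get(repo_id))
        checkpoint["processed_repos"].append(repo_id)
        checkpoint["partial_results"].pop(repo_id, None)  # space optimization
        checkpoint["last_processed_repo_id"] = repo_id
        save_checkpoint(checkpoint)  # incremental save after every repo
```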
diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py new file mode 100644 index 00000000..4bedcbac --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py @@ -0,0 +1,9 @@ +from .cli.main_cli import cli + +def main(): + # The obj={} is a way to initialize Click's context object + # if it's not being run directly by the `click` runner (e.g. `python -m devtooling_labels`) + cli(obj={}) + +if __name__ == "__main__": + main() diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py new file mode 100644 index 00000000..986cde36 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py @@ -0,0 +1,7 @@ +# This file makes the 'cli' directory a Python package. + +from .main_cli import cli + +__all__ = [ + "cli" +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py new file mode 100644 index 00000000..6a1a8ddd --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py @@ -0,0 +1,243 @@ +import click +from pathlib import Path + +from ..config.config_manager import ConfigManager +from ..pipeline.data_manager import DataManager +from ..processing.ai_service import AIService +from ..pipeline.repository_fetcher import RepositoryFetcherStep +from ..pipeline.summary_generator import SummaryGeneratorStep +from ..pipeline.categorizer import CategorizerStep +from ..pipeline.consolidator import ConsolidatorStep +from ..pipeline.unified_processor import UnifiedProcessor + +# Initialize ConfigManager globally or pass as context +# For simplicity here, we'll initialize it where needed or once at the top. +# A more robust Click app might use a context object. +config_manager = ConfigManager() # Loads default or existing pipeline_config.json + +@click.group() +@click.option('--test-mode', is_flag=True, help='Run in test mode (limits fetched repos, uses test_mode_limit from config).') +@click.pass_context +def cli(ctx, test_mode): + """DevTooling Labels CLI for processing and categorizing repositories.""" + ctx.ensure_object(dict) + + # Update config if test_mode flag is set via CLI + # This overrides the value in pipeline_config.json for this run + if test_mode: + config_manager.set("test_mode", True) + # No need to save if it's a per-run override. + # If we want to persist it: config_manager.save_config() + print(f"CLI flag --test-mode is set. Running in test mode. Limit: {config_manager.get_test_mode_limit()} repos.") + else: + # If not set by CLI, respect the config file's test_mode setting + # Or, explicitly set to False if CLI should always override to False when flag not present + # config_manager.set("test_mode", False) # Uncomment if CLI flag absence means test_mode is OFF + pass # Current behavior: respects config file if CLI flag is absent. 
+ + # Initialize services and pass them via context if needed by multiple commands + # Or initialize them within each command + output_dir = config_manager.get_output_dir() + data_manager = DataManager(output_dir=output_dir, config=config_manager) + ai_service = AIService(config_manager=config_manager) + + ctx.obj['config_manager'] = config_manager + ctx.obj['data_manager'] = data_manager + ctx.obj['ai_service'] = ai_service + ctx.obj['output_dir'] = output_dir + + +@cli.command("fetch_repos") +@click.option('--force-refresh', is_flag=True, help='Force refresh repository data, ignoring existing.') +@click.option('--fetch-new-only', is_flag=True, help='Only fetch repositories that don\'t exist in current data.') +@click.pass_context +def fetch_repos_command(ctx, force_refresh, fetch_new_only): + """Fetches repositories and their READMEs.""" + print("Executing: Fetch Repositories") + data_manager = ctx.obj['data_manager'] + # ConfigManager is already aware of test_mode from the group command + config_mgr = ctx.obj['config_manager'] + + repo_fetcher_step = RepositoryFetcherStep(data_manager=data_manager, config_manager=config_mgr) + repo_fetcher_step.run(force_refresh=force_refresh, fetch_new_only=fetch_new_only) + print("Repository fetching complete.") + + +@cli.command("generate_summaries") +@click.option('--force-refresh', is_flag=True, help='Force refresh summaries, ignoring existing.') +@click.option('--new-only', is_flag=True, help='Generate summaries only for repositories that don\'t have summaries yet.') +@click.pass_context +def generate_summaries_command(ctx, force_refresh, new_only): + """Generates summaries for the fetched repositories.""" + print("Executing: Generate Summaries") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + summary_generator_step = SummaryGeneratorStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + summary_generator_step.run(force_refresh=force_refresh, new_only=new_only) + print("Summary generation complete.") + + +@cli.command("categorize") +@click.option('--force-refresh', is_flag=True, help='Force refresh categories, ignoring existing.') +@click.option('--persona', help='Process only the specified persona.') +@click.option('--new-only', is_flag=True, help='Categorize only repositories that don\'t have categories yet.') +@click.pass_context +def categorize_command(ctx, force_refresh, persona, new_only): + """Categorizes projects using AI personas.""" + print("Executing: Categorize") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + categorizer_step = CategorizerStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + categorizer_step.run(force_refresh=force_refresh, target_persona_name=persona, new_only=new_only) + print("Categorization complete.") + + +@cli.command("consolidate") +@click.pass_context +def consolidate_command(ctx): + """Consolidates categorizations and generates final recommendations.""" + print("Executing: Consolidate Analysis") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + + consolidator_step = ConsolidatorStep(data_manager=data_manager, config_manager=config_mgr) + consolidator_step.run() + print("Consolidation complete.") + + +@cli.command("process_unified") +@click.option('--force-refresh', is_flag=True, help='Force refresh all data, ignoring existing.') 
+@click.option('--include-forks', is_flag=True, help='Include forked repositories in processing.') +@click.option('--include-inactive', is_flag=True, help='Include repositories not updated in the last year.') +@click.option('--limit', type=int, help='Limit the number of repositories to process.') +@click.pass_context +def process_unified_command(ctx, force_refresh, include_forks, include_inactive, limit): + """ + Unified processing: fetches repos, READMEs, generates summaries, and categorizes in one pass. + Outputs a single comprehensive dataset with all information. + """ + print("Executing: Unified Processing Pipeline") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + processor = UnifiedProcessor( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + + processor.run( + force_refresh=force_refresh, + include_forks=include_forks, + inactive_repos=include_inactive, + limit=limit + ) + + print("Unified processing complete.") + print(f"Results saved to:") + print(f" - {data_manager.unified_parquet_path} (Parquet format)") + print(f" - {data_manager.unified_csv_path} (CSV format)") + + +@cli.command("run_all") +@click.option('--force-refresh-all', is_flag=True, help='Force refresh all data stages.') +@click.option('--force-refresh-repos', is_flag=True, help='Force refresh repository data.') +@click.option('--force-refresh-summaries', is_flag=True, help='Force refresh summaries.') +@click.option('--force-refresh-categories', is_flag=True, help='Force refresh categories.') +@click.option('--use-unified', is_flag=True, help='Use the new unified processor instead of the legacy pipeline.') +@click.option('--include-forks', is_flag=True, help='Include forked repositories (only with --use-unified).') +@click.option('--include-inactive', is_flag=True, help='Include inactive repositories (only with --use-unified).') +@click.option('--limit', type=int, help='Limit the number of repositories to process (only with --use-unified).') +@click.pass_context +def run_all_command(ctx, force_refresh_all, force_refresh_repos, force_refresh_summaries, + force_refresh_categories, use_unified, include_forks, include_inactive): + """Runs the entire pipeline: either legacy steps or the new unified processor.""" + + if use_unified: + print("Executing: Run All Using Unified Processor") + ctx.invoke( + process_unified_command, + force_refresh=force_refresh_all, + include_forks=include_forks, + include_inactive=include_inactive, + limit=limit + ) + else: + print("Executing: Run All Pipeline Steps (Legacy)") + # Determine force_refresh flags for each step + fr_repos = force_refresh_all or force_refresh_repos + fr_summaries = force_refresh_all or force_refresh_summaries + fr_categories = force_refresh_all or force_refresh_categories + + # Invoke other commands with determined force_refresh settings + # The --test-mode flag from the main group is implicitly handled by ConfigManager + ctx.invoke(fetch_repos_command, force_refresh=fr_repos) + ctx.invoke(generate_summaries_command, force_refresh=fr_summaries) + ctx.invoke(categorize_command, force_refresh=fr_categories, persona=None, new_only=False) # Process all personas + ctx.invoke(consolidate_command) + + print("Full pipeline execution complete.") + +# Commands for managing personas in config +@cli.group("personas") +def personas_group(): + """Manage AI personas in the configuration.""" + pass + +@personas_group.command("list") +@click.pass_context +def 
list_personas(ctx): + """Lists all configured personas.""" + config_mgr = ctx.obj['config_manager'] + personas = config_mgr.get_personas() + if not personas: + print("No personas configured.") + return + print("Configured Personas:") + for p in personas: + print(f"- Name: {p['name']}, Title: {p.get('title', 'N/A')}") + +@personas_group.command("add") +@click.option('--name', required=True, help="Unique name for the persona.") +@click.option('--title', required=True, help="Display title for the persona.") +@click.option('--description', required=True, help="Description of the persona's focus.") +@click.option('--prompt-template', required=True, help="Prompt template for the persona's classification task.") +@click.pass_context +def add_persona(ctx, name, title, description, prompt_template): + """Adds a new persona to the configuration.""" + config_mgr = ctx.obj['config_manager'] + new_persona = { + "name": name, + "title": title, + "description": description, + "prompt": prompt_template + } + # config_mgr.add_persona(new_persona) # This method was removed as personas are managed in personas.py + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. '{name}' was not added via CLI.") + print("To add a persona, please edit the personas.py file directly.") + +@personas_group.command("remove") +@click.argument('name') +@click.pass_context +def remove_persona(ctx, name): + """Removes a persona by name. (Note: Persona management is now via personas.py)""" + # config_mgr = ctx.obj['config_manager'] + # config_mgr.remove_persona(name) # This method was removed from ConfigManager + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. '{name}' was not removed via CLI.") + print("To remove a persona, please edit the personas.py file directly.") + +if __name__ == '__main__': + cli(obj={}) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py new file mode 100644 index 00000000..a9fb5bb1 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py @@ -0,0 +1,139 @@ +import json +from pathlib import Path +from typing import List, Dict, Any +from .settings import PROJECT_ROOT, GEMINI_API_KEY, OSO_API_KEY, GITHUB_TOKEN, GEMINI_MODEL, OUTPUT_DIR +from .prompts.summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + + + +class ConfigManager: + def __init__(self, config_file_name: str = "pipeline_config.json"): + self.config_file_path = PROJECT_ROOT / config_file_name + self.config_data = self._load_config() + + def _load_config(self) -> Dict[str, Any]: + """ + Loads configuration from a JSON file, merging it with defaults. + If the file doesn't exist or is invalid, creates a default one. + Values in the JSON file override default values. + """ + default_config = self._get_default_config() + + if self.config_file_path.exists(): + with open(self.config_file_path, 'r') as f: + try: + loaded_config = json.load(f) + # Merge: loaded_config values override default_config values + merged_config = {**default_config, **loaded_config} + return merged_config + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON from {self.config_file_path}. 
Using full default config.") + # If JSON is corrupt, return the full default config, don't save it over potentially good file yet. + # Or, we could save default_config here if we want to overwrite corrupted file. + # For now, just return defaults for this session. + return default_config + else: + print(f"Config file not found at {self.config_file_path}. Creating and using default config.") + # Save the full default config as the new file + self.save_config(default_config) + return default_config + + def _get_default_config(self) -> Dict[str, Any]: + """Returns the default configuration dictionary.""" + return { + "output_dir": str(OUTPUT_DIR), + "gemini_model": GEMINI_MODEL, + "summary_prompt_template": SUMMARY_PROMPT, + "tags_prompt_template": TAGS_PROMPT, + "test_mode": False, + "test_mode_limit": 5, + "batch_size_summaries": 50, + "batch_size_categorization": 10 # Smaller batch for categorization due to prompt complexity + } + + def save_config(self, config_data: Dict[str, Any] = None): + """Saves the current configuration to the JSON file.""" + data_to_save = config_data if config_data else self.config_data + with open(self.config_file_path, 'w') as f: + json.dump(data_to_save, f, indent=4) + print(f"Configuration saved to {self.config_file_path}") + + def get(self, key: str, default: Any = None) -> Any: + """Gets a configuration value by key.""" + return self.config_data.get(key, default) + + def set(self, key: str, value: Any): + """Sets a configuration value and saves the config.""" + if key in ["gemini_api_key", "oso_api_key", "github_token"]: + print(f"Warning: Attempted to set API key '{key}' in config file. API keys should be managed via .env file.") + return + self.config_data[key] = value + self.save_config() + + # --- API Key Getters --- + def get_gemini_api_key(self) -> str: + """Gets the Gemini API key directly from settings (environment).""" + return GEMINI_API_KEY + + def get_oso_api_key(self) -> str: + """Gets the OSO API key directly from settings (environment).""" + return OSO_API_KEY + + def get_github_token(self) -> str: + """Gets the GitHub token directly from settings (environment).""" + return GITHUB_TOKEN + + # --- Other Getters --- + def get_personas(self) -> List[Dict[str, str]]: + """Gets the list of personas directly from the personas.py module.""" + from .prompts.personas import PERSONAS + return PERSONAS + + # add_persona and remove_persona are removed as personas are managed in personas.py + + def is_test_mode(self) -> bool: + """Checks if test mode is enabled.""" + return self.get("test_mode", False) + + def get_test_mode_limit(self) -> int: + """Gets the limit for test mode.""" + return self.get("test_mode_limit", 5) + + def get_output_dir(self) -> Path: + return Path(self.get("output_dir", str(OUTPUT_DIR))) + + def get_batch_size_summaries(self) -> int: + return self.get("batch_size_summaries", 50) + + def get_batch_size_categorization(self) -> int: + return self.get("batch_size_categorization", 10) + + def get_categories(self) -> List[Dict[str, str]]: + """Gets the categories directly from the categories.py module.""" + from .prompts.categories import CATEGORIES + return CATEGORIES + + def get_category_names(self) -> List[str]: + """Gets the category names directly from the categories.py module.""" + from .prompts.categories import CATEGORY_NAMES + return CATEGORY_NAMES + + def get_summary_prompt_template(self) -> str: + return self.get("summary_prompt_template", "") + + def get_tags_prompt_template(self) -> str: + return 
self.get("tags_prompt_template", "") + +if __name__ == "__main__": + # Example usage: + config_manager = ConfigManager() + print(f"Output Directory: {config_manager.get_output_dir()}") + print(f"Test Mode: {config_manager.is_test_mode()}") + # Example active print for personas: + print("\nPersonas (from personas.py):") + for p in config_manager.get_personas(): + print(f"- {p['name']}: {p['title']}") + + print("\nCategories (from categories.py):") + for cat_name in config_manager.get_category_names(): + print(f"- {cat_name}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py new file mode 100644 index 00000000..c22d1787 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py @@ -0,0 +1,11 @@ +from .categories import CATEGORIES, CATEGORY_NAMES +from .personas import PERSONAS +from .summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + +__all__ = [ + 'CATEGORIES', + 'CATEGORY_NAMES', + 'PERSONAS', + 'SUMMARY_PROMPT', + 'TAGS_PROMPT', +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py new file mode 100644 index 00000000..bd201d31 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py @@ -0,0 +1,209 @@ +CATEGORIES = [ + # DeFi Application Categories + { + "category": "Lending & Borrowing Protocols", + "description": ( + "Lending & Borrowing Protocols include implementations and SDKs for collateralized lending markets, " + "flash loans, interest rate models, and liquidation mechanisms. These tools handle asset management, " + "risk scoring, and pool accounting, enabling users to lend or borrow assets in a trust-minimized way." + ) + }, + { + "category": "Decentralized Exchanges (DEXs)", + "description": ( + "DEXs power peer-to-peer asset swaps and liquidity provision. This includes AMM (automated market maker) " + "frameworks, order book DEXes, routers, aggregators, and liquidity management libraries. They also often " + "support advanced trading mechanisms like TWAPs, limit orders, and MEV protection." + ) + }, + { + "category": "Derivatives & Synthetic Assets", + "description": ( + "Derivatives & Synthetic Assets frameworks implement perpetual futures, options, and collateralized synthetic " + "asset systems. These toolkits involve complex pricing oracles, risk engines, margin systems, and settlement layers." + ) + }, + { + "category": "Stablecoin Infrastructure", + "description": ( + "Stablecoin Infrastructure includes minting contracts, collateralization engines, algorithmic stabilization mechanisms, " + "and off-chain attestation integrations. It also encompasses tools for analyzing backing ratios and peg health." + ) + }, + { + "category": "Oracles & Price Feeds", + "description": ( + "Oracles & Price Feeds provide real-world and cross-chain data into smart contracts. This category covers push-based oracles, " + "pull-based on-demand queries, cryptoeconomic staking oracles, and off-chain data relayers." + ) + }, + { + "category": "Vaults, Yield Strategies & Aggregators", + "description": ( + "These tools optimize capital across yield-bearing protocols. 
They include yield routers, auto-compounding vaults, and rebalancers, " + "as well as SDKs to model risk-return profiles and dynamically allocate capital across farms and lending markets." + ) + }, + { + "category": "Asset Management & Portfolio Tooling", + "description": ( + "Asset Management tooling includes interfaces and libraries for building rebalancing strategies, vault-based funds, on-chain ETFs, " + "and automated index trackers. They often incorporate fee structures, role-based access, and compliance checks." + ) + }, + { + "category": "DeFi Security & Monitoring", + "description": ( + "Security tools for DeFi include real-time exploit detectors, anomaly detection systems, pause mechanisms, multisig enforcers, " + "and post-mortem forensic tools. Monitoring dashboards and alerting frameworks fall here as well." + ) + }, + { + "category": "Governance & DAO Tooling", + "description": ( + "Governance & DAO Tooling enables on-chain proposal management, token-weighted voting, off-chain signaling, execution queues, " + "and guardrails for DeFi governance systems. Includes snapshot integration, timelocks, and delegate management interfaces." + ) + }, + { + "category": "Liquidity Bootstrapping & Token Distribution", + "description": ( + "This includes tools for liquidity mining, airdrops, vesting contracts, bonding curves, and initial token offerings. " + "They facilitate community-led distribution, price discovery, and progressive decentralization of DeFi protocols." + ) + }, + { + "category": "DeFi Analytics & Dashboards", + "description": ( + "These are SDKs, APIs, and frontends for aggregating on-chain DeFi metrics—TVL, yield, volume, and user activity. " + "Includes data pipelines, Dune-compatible libraries, subgraphs, and event-based ETL infrastructure tailored to DeFi." + ) + }, + { + "category": "Cross-chain DeFi Infrastructure", + "description": ( + "These tools support multi-chain liquidity routing, cross-chain yield farming, state relays, and synthetic asset issuance. " + "They abstract away bridging mechanics, offering seamless user and liquidity migration across ecosystems." + ) + }, + { + "category": "User Interface & Integration SDKs", + "description": ( + "SDKs and frontend libraries for integrating DeFi functionality into wallets, dApps, and aggregators. Includes trade UIs, " + "Zap interfaces, gas estimators, and batch transaction helpers to improve DeFi UX." + ) + }, + { + "category": "Simulation & Risk Modeling", + "description": ( + "Tools that simulate user positions, economic incentives, or protocol upgrades. They model protocol resilience, agent behavior, " + "market shocks, and contagion scenarios, often using agent-based or Monte Carlo methods for risk-aware design." + ) + }, + + # Developer Tool Categories + { + "category": "Language & Compilation Tools", + "description": ( + "Language & Compilation Tools include compilers, interpreters, language servers, " + "and syntax utilities for smart-contract development. They translate high-level " + "source code into EVM bytecode, perform static analysis, and enable features like " + "symbolic execution, forming the foundation for all higher-level tooling." + ) + }, + { + "category": "Core Protocol Interfaces", + "description": ( + "Core Protocol Interfaces are libraries and SDKs that provide reusable building blocks " + "for blockchain developers—smart contract libraries, JSON-RPC clients, transaction builders, " + "wallet and key management, authorization, signature handling, and ABI encoding/decoding. 
" + "They can power the core operations of many dApps and services." + ) + }, + { + "category": "Development Frameworks", + "description": ( + "Development Frameworks are opinionated, end-to-end toolchains that scaffold, build, " + "test, and deploy smart-contract projects. They bundle CLIs, IDE integrations, task " + "runners, local networks, hot-reloading, and plugin ecosystems to enforce conventions " + "and automate workflows from project setup through to frontend integration." + ) + }, + { + "category": "Deployment & Lifecycle Management", + "description": ( + "Deployment & Lifecycle Management tools handle contract deployment, upgrades, and " + "on-chain migrations. They automate predictable CREATE2 strategies, proxy pattern " + "management, cross-network publishes, and governance hooks, while integrating safety " + "checks and test-suite validations to maintain contract integrity." + ) + }, + { + "category": "Testing & Verification Tools", + "description": ( + "Testing & Verification Tools provide frameworks for unit testing, property-based fuzzing, " + "symbolic execution, formal verification, and coverage analysis. They integrate vulnerability " + "scanners, static analyzers, and coverage reporters to identify edge-case failures and ensure " + "on-chain correctness." + ) + }, + { + "category": "Developer Experience Tools", + "description": ( + "Developer Experience Tools are lightweight plugins and utilities that boost productivity " + "and enforce code consistency. This category includes editor extensions, linters, formatters, " + "code generators, documentation generators, and small CLI helpers." + ) + }, + { + "category": "Infrastructure & Node Operations", + "description": ( + "Infrastructure & Node Operations encompass tools for running, coordinating, and scaling " + "blockchain nodes and peer-to-peer networks. They cover RPC providers, telemetry collectors, " + "log aggregators, gossip-based messaging layers, peer discovery and connection management, " + "and automation scripts to ensure reliable network participation." + ) + }, + { + "category": "Data Indexing & Analytics", + "description": ( + "Data Indexing & Analytics tools ingest, process, and visualize on-chain data. They provide " + "GraphQL and REST APIs over processed datasets, real-time event streaming, and libraries or " + "dashboards for analyzing blockchain metrics." + ) + }, + { + "category": "Interoperability & Cross-chain", + "description": ( + "Interoperability & Cross-chain covers bridging frameworks, cross-chain messaging protocols, " + "and Superchain interoperability tooling. These libraries enable seamless asset transfers, " + "state proofs, and communication across multiple networks." + ) + }, + { + "category": "Cryptography & Primitives", + "description": ( + "Cryptography & Primitives includes low-level cryptographic libraries and building blocks—" + "hash functions, signature schemes, Merkle trees, zero-knowledge proof primitives, and " + "encryption utilities—optimized for security and performance." + ) + }, + { + "category": "Application-Specific & Niche Tools", + "description": ( + "Application-Specific & Niche Tools are libraries and SDKs tailored to very narrow use cases " + "(e.g., DeFi adapters, NFT marketplaces, governance dashboards). They serve specific projects " + "but do not have broad applicability or reusability across the ecosystem." 
+ ) + }, + { + "category": "Others", + "description": ( + "Others is a catch-all for repositories with limited usage or insufficient information—" + "empty projects, single-file utilities, or items that cannot be reasonably categorized." + ) + } +] + +# Create a list of category names for easy access +CATEGORY_NAMES = [cat["category"] for cat in CATEGORIES] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py new file mode 100644 index 00000000..209515cf --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py @@ -0,0 +1,110 @@ +PERSONAS = [ + { + "name": "protocol_architect", + "title": "Protocol & Infrastructure Architect", + "description": ( + "You evaluate projects based on their technical architecture, infrastructure role, " + "and protocol design patterns. You focus on how well the project implements DeFi primitives, " + "contributes to ecosystem stability, and maintains technical dependencies." + ), + "prompt": ( + "As a Protocol & Infrastructure Architect, analyze the project's technical foundations, " + "infrastructure role, and protocol design.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Based on the technical architecture, infrastructure contribution, and protocol design, " + "choose one of the categories below:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of protocol architecture, infrastructure role, technical dependencies, and ecosystem stability"\n' + "}}" + ), + }, + { + "name": "ecosystem_analyst", + "title": "Ecosystem Growth Analyst", + "description": ( + "You assess projects based on their potential to grow the Ethereum DeFi ecosystem, " + "their user adoption metrics, and their contribution to composability and innovation." + ), + "prompt": ( + "As an Ecosystem Growth Analyst, evaluate the project's impact on DeFi ecosystem growth.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select the category that best represents its ecosystem role:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of ecosystem impact, adoption potential, and composability"\n' + "}}" + ), + }, + { + "name": "security_researcher", + "title": "Security & Risk Researcher", + "description": ( + "You focus on security practices, risk management approaches, and the project's " + "contribution to making DeFi safer and more resilient." + ), + "prompt": ( + "As a Security & Risk Researcher, assess the project's security posture and risk management.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best reflects its security and risk management approach:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of security practices, risk management, and safety features"\n' + "}}" + ), + }, + { + "name": "user_experience_advocate", + "title": "User Experience Advocate", + "description": ( + "You evaluate projects based on their user experience, accessibility, and potential " + "to onboard new users to DeFi. 
You focus on usability and integration capabilities." + ), + "prompt": ( + "As a User Experience Advocate, assess the project's usability and accessibility.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select the category that best represents its user experience focus:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of user experience, accessibility, and onboarding potential"\n' + "}}" + ), + }, + { + "name": "governance_specialist", + "title": "Governance & Decentralization Specialist", + "description": ( + "You analyze projects based on their governance mechanisms, decentralization approach, " + "and contribution to sustainable protocol management." + ), + "prompt": ( + "As a Governance & Decentralization Specialist, evaluate the project's governance model.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best reflects its governance and decentralization approach:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of governance mechanisms, decentralization, and sustainability"\n' + "}}" + ), + } +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py new file mode 100644 index 00000000..cdd27bfb --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py @@ -0,0 +1,32 @@ +SUMMARY_PROMPT = ( + "You are an analyst preparing short, neutral briefs on open-source projects. 
" + "Read the README below and write a **concise, 2- to 3-sentence summary** that:\n" + "• states the project’s core purpose / problem it solves\n" + "• lists its main capabilities or components (1–3 key points only)\n" + "• mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n" + "• notes any strongly signalled context such as supported programming language, network, or runtime\n" + "\n" + "**Style constraints**\n" + "• Use plain, factual language in third person (no hype, no marketing adjectives).\n" + "• **Do not** guess or invent details that are not explicit in the README.\n" + "• **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n" + "• Limit the summary to <100 words; avoid bullet lists or line breaks.\n" + "\n" + "Return your answer as **exactly one valid JSON object** in this form (nothing extra):\n" + "{{\n" + ' \"summary\": \"your summary here\"\n' + "}}\n" + "\n" + "README:\n" + "{readme_md}" +) + +TAGS_PROMPT = ( + "Based on this project summary, generate a list of relevant tags that " + "describe the project's purpose and functionality.\n\n" + "You must respond with a valid JSON object in this exact format:\n" + "{{\n" + ' "tags": ["tag1", "tag2", "tag3"]\n' + "}}\n\n" + "Summary:\n{summary}" +) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py new file mode 100644 index 00000000..79c7ebb0 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py @@ -0,0 +1,26 @@ +import os +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# API Keys +OSO_API_KEY = os.getenv("OSO_API_KEY") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") + +# Project paths +PROJECT_ROOT = Path(__file__).parent.parent.parent +DATA_DIR = PROJECT_ROOT / "data" +OUTPUT_DIR = PROJECT_ROOT / "output" + +# Create directories if they don't exist +DATA_DIR.mkdir(exist_ok=True) +OUTPUT_DIR.mkdir(exist_ok=True) + +# GitHub API settings +GITHUB_HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} + +# Gemini model settings +GEMINI_MODEL = "gemini-2.0-flash" \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py new file mode 100644 index 00000000..51e9f286 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py @@ -0,0 +1,15 @@ +# This file makes the 'pipeline' directory a Python package. 
+ +from .data_manager import DataManager +from .repository_fetcher import RepositoryFetcherStep +from .summary_generator import SummaryGeneratorStep +from .categorizer import CategorizerStep +from .consolidator import ConsolidatorStep + +__all__ = [ + "DataManager", + "RepositoryFetcherStep", + "SummaryGeneratorStep", + "CategorizerStep", + "ConsolidatorStep", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py new file mode 100644 index 00000000..67762e9a --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py @@ -0,0 +1,172 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, ClassificationOutput + +class CategorizerStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, target_persona_name: str = None, new_only: bool = False): + """ + Categorize projects using AI personas. + Uses batch_size_categorization from config. + + Args: + force_refresh: If True, wipe existing categories and regenerate all + target_persona_name: If specified, only process this persona + new_only: If True, only categorize repositories that don't have categories yet + """ + batch_size = self.config_manager.get_batch_size_categorization() + + if force_refresh: + if target_persona_name: + print(f"Force refresh enabled for persona '{target_persona_name}'. Wiping existing category data for this persona.") + self.data_manager.wipe_categories_data(persona_name=target_persona_name) + else: + print("Force refresh enabled for all personas. Wiping all existing category data.") + self.data_manager.wipe_categories_data() + + # Get summaries data + summaries_df = self.data_manager.get_summaries_data() + if summaries_df.empty: + print("No summarized data found to categorize. Skipping.") + return pd.DataFrame() + + if 'summary' not in summaries_df.columns: + print("Error: 'summary' column not found in summarized data. 
Cannot categorize.") + return pd.DataFrame() + if 'repo_artifact_id' not in summaries_df.columns: + print("Error: 'repo_artifact_id' not found in summarized data.") + return pd.DataFrame() + + # Get personas to process + personas_to_process = [] + if target_persona_name: + persona = self.config_manager.get_persona(target_persona_name) + if persona: + personas_to_process = [persona] + else: + print(f"Error: Persona '{target_persona_name}' not found.") + return pd.DataFrame() + else: + personas_to_process = self.config_manager.get_personas() + + if not personas_to_process: + print("No personas found to process.") + return pd.DataFrame() + + # Process each persona + for persona in personas_to_process: + persona_name = persona['name'] + print(f"\nProcessing persona: {persona_name}") + + # Get existing categories for this persona if any + existing_categories_df = pd.DataFrame() + if not force_refresh: + try: + existing_categories_df = self.data_manager.get_categories_data(persona_name) + except FileNotFoundError: + pass # No existing categories for this persona + + # If we have existing categories and not forcing refresh + if not existing_categories_df.empty and not force_refresh: + if new_only: + # Filter out repositories that already have categories + existing_repos = set(existing_categories_df['repo_artifact_id']) + repos_to_process = summaries_df[~summaries_df['repo_artifact_id'].isin(existing_repos)] + if repos_to_process.empty: + print(f"No new repositories found to categorize for persona '{persona_name}'.") + continue + print(f"Found {len(repos_to_process)} new repositories to categorize for persona '{persona_name}'.") + else: + print(f"Categories already exist for persona '{persona_name}' and force_refresh is false. Skipping.") + continue + else: + repos_to_process = summaries_df + + # Process in batches + all_categorized_data = [] + for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc=f"Categorizing ({persona_name})", leave=False): + end_idx = min(start_idx + batch_size, len(repos_to_process)) + batch_df = repos_to_process.iloc[start_idx:end_idx] + + # Prepare list of dicts, each containing summary and metadata for a project + project_data_batch = [] + required_metadata_cols = ['star_count', 'fork_count', 'created_at', 'updated_at'] + for _, row in batch_df.iterrows(): + project_data = { + 'summary': row.get('summary', ''), + 'repo_artifact_id': row.get('repo_artifact_id', 'UNKNOWN_ID') + } + for col in required_metadata_cols: + project_data[col] = row.get(col) # Will be None if missing, pandas NaT for dates + project_data_batch.append(project_data) + + if not project_data_batch or all(not item['summary'] for item in project_data_batch): + print(f"Skipping batch for {persona_name} as all summaries are effectively empty.") + classifications = [ClassificationOutput(assigned_tag="N/A", reason="Empty summary or batch")] * len(project_data_batch) + else: + classifications: List[ClassificationOutput] = self.ai_service.classify_projects_batch_for_persona( + project_data_batch, + persona + ) + + # Create a temporary DataFrame for this batch's results + temp_batch_df = batch_df.copy() + temp_batch_df[f"{persona_name}_tag"] = [c.assigned_tag for c in classifications] + temp_batch_df[f"{persona_name}_reason"] = [c.reason for c in classifications] + all_categorized_data.append(temp_batch_df) + + if not all_categorized_data: + print(f"No categories were generated for persona '{persona_name}'.") + continue + + new_categories_df = pd.concat(all_categorized_data, ignore_index=True) 
+ + # If we have existing categories and not forcing refresh, combine with new ones + if not existing_categories_df.empty and not force_refresh: + final_categories_df = pd.concat([existing_categories_df, new_categories_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_categories_df = final_categories_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new categorization if there was a duplicate + ) + print(f"Combined data now contains {len(final_categories_df)} repositories with categories for persona '{persona_name}'.") + else: + final_categories_df = new_categories_df + + self.data_manager.save_categories_data(final_categories_df, persona_name) + + return pd.DataFrame() # Return empty DataFrame as we've saved the data + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + if dt_manager.get_summaries_data().empty: + print("No summarized data found. Please run SummaryGeneratorStep first or ensure data exists.") + else: + categorizer_step = CategorizerStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + print("\nRunning CategorizerStep...") + # Set force_refresh=True to re-categorize. + # Specify target_persona_name="keyword_spotter" to only run for one. + categorized_data = categorizer_step.run(force_refresh=False, target_persona_name=None) + + if not categorized_data.empty: + print(f"\nCategorized data head:\n{categorized_data.head()}") + print(f"Number of rows in categorized data: {len(categorized_data)}") + print(f"Columns: {categorized_data.columns.tolist()}") + else: + print("No data returned from categorization step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py new file mode 100644 index 00000000..e4fe539f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py @@ -0,0 +1,183 @@ +import pandas as pd +import numpy as np +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +# from ..config.prompts.tag_mappings import TAG_TO_CATEGORY # Removed + +class ConsolidatorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + + def run(self): + """Consolidate and analyze the classification results from all personas.""" + print("\nConsolidating analysis...") + + # Get the merged data from all personas + # DataManager's get_categories_data() without persona_name should provide this. + categorized_df = self.data_manager.get_categories_data() + + if categorized_df.empty: + print("No categorized data found to consolidate. Skipping.") + return pd.DataFrame() + + # Ensure essential columns are present + if 'repo_artifact_id' not in categorized_df.columns and 'project_id' not in categorized_df.columns: + print("Error: 'repo_artifact_id' or 'project_id' not found in categorized data.") + return pd.DataFrame() + + # Use 'project_id' for grouping if available, else 'repo_artifact_id' + # The original code used 'project_id' for project-level aggregation. + # The raw data from OSO has 'project_id'. Summaries and categories should retain it. 
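# Illustrative example (not itself part of the patch): the project-level recommendation
# computed further below is star-weighted. Every repo in a project adds its star_count
# to each category its persona tags name, and the category with the largest accumulated
# weight wins. A hypothetical two-repo project:
#
#   repo A (1200 stars): protocol_architect -> "Development Frameworks",
#                        security_researcher -> "Testing & Verification Tools"
#   repo B (300 stars):  protocol_architect -> "Development Frameworks",
#                        security_researcher -> "Development Frameworks"
#
#   weights: "Development Frameworks"       = 1200 + 300 + 300 = 1800
#            "Testing & Verification Tools" = 1200
#   recommendation -> "Development Frameworks"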
+ + # Identify persona tag columns + personas = self.config_manager.get_personas() + persona_tag_cols = [f"{persona['name']}_tag" for persona in personas if f"{persona['name']}_tag" in categorized_df.columns] + + if not persona_tag_cols: + print("No persona tag columns found in the categorized data. Cannot consolidate.") + return categorized_df # Return as is, or an empty DF + + # Fill NaNs in numeric columns that might be used for weighting (e.g., star_count) + # These columns should ideally come from the raw_repos_data or summaries_data. + # The categorized_df from DataManager should already have these if merged correctly. + numeric_cols_to_fill = ['star_count', 'fork_count', 'num_packages_in_deps_dev'] + for col in numeric_cols_to_fill: + if col in categorized_df.columns: + categorized_df[col] = categorized_df[col].fillna(0) + else: + # If star_count is missing, we can't do weighted summary as originally designed. + # For now, we'll proceed without it if missing. + print(f"Warning: Column '{col}' not found for consolidation. Weighted summary might be affected.") + + # Drop readme_md if it exists, as it's large and not needed for consolidation + if 'readme_md' in categorized_df.columns: + categorized_df = categorized_df.drop(columns=['readme_md']) + + # Group by project_id to consolidate recommendations + # Define grouping keys. project_id is essential. + grouping_keys = ['project_id'] + # Add other descriptive columns that should be unique per project or take the first + if 'display_name' in categorized_df.columns: grouping_keys.append('display_name') + if 'atlas_id' in categorized_df.columns: grouping_keys.append('atlas_id') + + # Ensure grouping keys are valid and exist in the DataFrame + valid_grouping_keys = [key for key in grouping_keys if key in categorized_df.columns] + if 'project_id' not in valid_grouping_keys: + print("Critical error: 'project_id' is missing. 
Cannot perform project-level consolidation.") + # Save the repo-level data with repo-level recommendations if project_id is missing + # This part re-uses the previous logic for repo-level recommendation if grouping fails + repo_recommendations = [] + if not categorized_df.empty and persona_tag_cols: + for index, row in categorized_df.iterrows(): + assignments = [row[col] for col in persona_tag_cols if pd.notna(row[col]) and row[col] not in ["Error", "N/A", "Other"]] + if assignments: + mode_series = pd.Series(assignments).mode() + repo_recommendations.append(mode_series[0] if not mode_series.empty else 'Other') + else: + repo_recommendations.append('Other') + categorized_df['recommendation'] = repo_recommendations + else: + categorized_df['recommendation'] = 'Other' + self.data_manager.save_consolidated_data(categorized_df) + print("Consolidated analysis saved (repo-level due to missing project_id).") + return categorized_df + + print(f"Consolidating at project level using keys: {valid_grouping_keys}") + + def aggregate_project_data(group): + # New logic for star-weighted recommendation + category_star_weights = {} # Stores sum of stars for each category + + for _, repo_row in group.iterrows(): # Iterate over each repo in the project + stars = repo_row.get('star_count', 0) # star_count was already filled with 0 for NaNs + + # Ensure stars is a non-negative number (already handled by fillna(0) but good practice) + if pd.isna(stars) or not isinstance(stars, (int, float)) or stars < 0: + stars = 0 + else: + stars = int(stars) # Ensure it's an integer for summation + + for p_col in persona_tag_cols: # Iterate over each persona's tag column + category = repo_row.get(p_col) + # Check if category is valid + if pd.notna(category) and category not in ["Error", "N/A", "Other"]: + category_star_weights[category] = category_star_weights.get(category, 0) + stars + + if not category_star_weights: + recommendation = 'Other' + else: + # Find the category with the maximum accumulated star weight + # pd.Series(category_star_weights).idxmax() returns the category (index) with the max value + recommendation = pd.Series(category_star_weights).idxmax() + + # Aggregate other fields + agg_data = { + 'recommendation': recommendation, + 'repo_artifact_namespaces': list(group['repo_artifact_namespace'].unique()) if 'repo_artifact_namespace' in group else [], + 'repo_count': group['repo_artifact_id'].nunique() if 'repo_artifact_id' in group else 0, + 'total_stars': group['star_count'].sum() if 'star_count' in group else 0, + 'total_forks': group['fork_count'].sum() if 'fork_count' in group else 0, + # Add summaries of the top N repos or a combined summary if needed + # For now, let's take the summary of the first repo in the group (by original order) + 'sample_summary': group['summary'].iloc[0] if 'summary' in group and not group['summary'].empty else "" + } + # Add persona tags for the project (e.g., mode of each persona's tags for this project) + for p_col in persona_tag_cols: + persona_project_tags = group[p_col].dropna().tolist() + valid_persona_tags = [tag for tag in persona_project_tags if tag not in ["Error", "N/A", "Other"]] + if valid_persona_tags: + agg_data[f"{p_col}_mode"] = pd.Series(valid_persona_tags).mode()[0] if pd.Series(valid_persona_tags).mode().any() else "N/A" + else: + agg_data[f"{p_col}_mode"] = "N/A" + + return pd.Series(agg_data) + + # Group by valid_grouping_keys and apply aggregation + # Use as_index=False if valid_grouping_keys are to be columns, otherwise they become index + 
project_consolidated_df = categorized_df.groupby(valid_grouping_keys, as_index=False).apply(aggregate_project_data) + + # If groupby().apply() changes the structure unexpectedly (e.g. multi-index if as_index=True was used) + # ensure project_consolidated_df is flat. With as_index=False, it should be. + # If aggregate_project_data returns a Series, and groupby has as_index=False, + # the result should be a DataFrame where grouping keys are columns, and new columns from Series. + # If apply returns a DataFrame, it might need reset_index(). + # Let's ensure it's flat: + if not isinstance(project_consolidated_df.index, pd.RangeIndex): + project_consolidated_df = project_consolidated_df.reset_index() + + + final_df = project_consolidated_df + + # Save results + print(f"\nSaving consolidated analysis (project-level)...") + self.data_manager.save_consolidated_data(final_df) + print("Consolidated analysis saved successfully.") + return final_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure categorized data exists (run categorizer.py example first if needed) + # DataManager's get_categories_data() should merge individual persona files. + if dt_manager.get_categories_data().empty: + print("No categorized data found. Please run CategorizerStep first or ensure data exists.") + else: + consolidator_step = ConsolidatorStep( + data_manager=dt_manager, + config_manager=cfg_manager + ) + print("\nRunning ConsolidatorStep...") + consolidated_df = consolidator_step.run() + + if not consolidated_df.empty: + print(f"\nConsolidated data head:\n{consolidated_df.head()}") + print(f"Number of rows in consolidated data: {len(consolidated_df)}") + print(f"Consolidated columns: {consolidated_df.columns.tolist()}") + print(f"\nRecommendations sample:\n{consolidated_df[['project_id', 'display_name', 'recommendation']].head() if 'project_id' in consolidated_df.columns and 'display_name' in consolidated_df.columns else consolidated_df['recommendation'].head()}") + + else: + print("No data returned from consolidation step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py new file mode 100644 index 00000000..035e1caf --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py @@ -0,0 +1,391 @@ +import pandas as pd +import json +from pathlib import Path +import shutil +from typing import List, Dict, Any +from ..config.settings import PROJECT_ROOT + +class DataManager: + def __init__(self, output_dir: Path, config=None): + self.output_dir = output_dir + self.config = config # For future use, e.g., different storage backends + + # Legacy paths + self.raw_parquet_path = self.output_dir / "devtooling_raw.parquet" + self.summarized_parquet_path = self.output_dir / "devtooling_summarized.parquet" + self.categorized_dir = self.output_dir / "categorized" + self.final_parquet_path = self.output_dir / "devtooling_full.parquet" + self.consolidated_csv_path = self.output_dir / "devtooling_consolidated.csv" + + # New unified data paths - ensure they're in the current repo's output directory + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + self.unified_parquet_path = local_output_dir / "ethereum_repos_unified.parquet" + self.unified_csv_path = 
local_output_dir / "ethereum_repos_unified.csv" + + # Create directories if they don't exist + self.output_dir.mkdir(parents=True, exist_ok=True) + self.categorized_dir.mkdir(parents=True, exist_ok=True) + + def get_repos_data(self) -> pd.DataFrame: + """Get the latest repository data""" + if self.raw_parquet_path.exists(): + return pd.read_parquet(self.raw_parquet_path) + return pd.DataFrame() + + def get_summaries_data(self) -> pd.DataFrame: + """Get the latest summaries data""" + if self.summarized_parquet_path.exists(): + return pd.read_parquet(self.summarized_parquet_path) + return pd.DataFrame() + + def get_categories_data(self, persona_name: str = None) -> pd.DataFrame: + """Get the latest categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / f"{persona_name}.parquet" + if persona_file.exists(): + return pd.read_parquet(persona_file) + return pd.DataFrame() + else: + # Combine all persona files + all_persona_dfs = [] + for persona_file in self.categorized_dir.glob("*.parquet"): + df = pd.read_parquet(persona_file) + all_persona_dfs.append(df) + + if not all_persona_dfs: + return pd.DataFrame() + + # Concatenate all dataframes. If a project appears in multiple files, + # the last one read will take precedence for shared columns (like 'summary'). + # Persona-specific columns (e.g., 'persona_X_tag') will be unique. + # We need a more robust way to merge these if there are overlapping non-persona columns. + # For now, assuming 'project_id' or 'repo_artifact_id' is the key. + + # A simple concat might lead to duplicate columns if not handled carefully. + # Let's assume each persona file has unique columns for its tags/reasons. + # And common columns like 'project_id', 'summary' are present. + + # Start with the summaries data as the base + base_df = self.get_summaries_data() + if base_df.empty: + # If no summaries, try to load from the first persona file as a base + if all_persona_dfs: + base_df = all_persona_dfs[0][['project_id', 'repo_artifact_id', 'summary']].copy() # Adjust columns as needed + else: + return pd.DataFrame() + + + # Set index for joining + if 'repo_artifact_id' in base_df.columns: + base_df = base_df.set_index('repo_artifact_id') + elif 'project_id' in base_df.columns: + base_df = base_df.set_index('project_id') + else: + # Fallback if no clear index, this might lead to issues + print("Warning: No clear index (project_id or repo_artifact_id) for merging category data.") + + + for df_persona in all_persona_dfs: + # Identify the persona name from its columns (e.g., "keyword_spotter_tag") + current_persona_name = None + for col_name in df_persona.columns: + if col_name.endswith("_tag"): + current_persona_name = col_name.replace("_tag", "") + break + + if not current_persona_name: + print(f"Warning: Could not determine persona name from columns in a categorized file. Skipping this file.") + continue + + # Columns to join are just the tag and reason for this specific persona + persona_tag_col = f"{current_persona_name}_tag" + persona_reason_col = f"{current_persona_name}_reason" + + cols_from_persona_df = [] + if persona_tag_col in df_persona.columns: + cols_from_persona_df.append(persona_tag_col) + if persona_reason_col in df_persona.columns: + cols_from_persona_df.append(persona_reason_col) + + if not cols_from_persona_df: + print(f"Warning: No tag/reason columns found for persona {current_persona_name} in its file. 
Skipping join for this persona.") + continue + + # Set index for df_persona before selecting columns for join + if base_df.index.name in df_persona.columns: # base_df.index.name is 'repo_artifact_id' or 'project_id' + df_persona_indexed = df_persona.set_index(base_df.index.name) + else: + print(f"Warning: Index column '{base_df.index.name}' not found in persona DataFrame for {current_persona_name}. Attempting join without re-indexing persona df, might be incorrect.") + df_persona_indexed = df_persona # This might lead to issues if not indexed properly + + # Ensure only existing columns are selected from df_persona_indexed + valid_cols_to_join = [col for col in cols_from_persona_df if col in df_persona_indexed.columns] + + if not valid_cols_to_join: + print(f"Warning: Persona specific columns {cols_from_persona_df} not found as actual columns in indexed persona dataframe for {current_persona_name}. Skipping join for this persona.") + continue + + base_df = base_df.join(df_persona_indexed[valid_cols_to_join], how='left', rsuffix=f'_{current_persona_name}_dup') + + # Clean up duplicate columns if any (this is a basic cleanup for rsuffix) + cols_to_drop = [col for col in base_df.columns if '_dup' in col] + base_df.drop(columns=cols_to_drop, inplace=True, errors='ignore') + + return base_df.reset_index() + + + def save_repos_data(self, data: pd.DataFrame): + """Save repository data""" + data.to_parquet(self.raw_parquet_path, index=False) + print(f"Repository data saved to {self.raw_parquet_path}") + + def save_summaries_data(self, data: pd.DataFrame, append: bool = False): + """Save summaries data. If append is True, appends to existing file if it exists.""" + if append and self.summarized_parquet_path.exists(): + existing_df = pd.read_parquet(self.summarized_parquet_path) + # Ensure no duplicate columns before concat, especially if 'summary' is regenerated + # A more robust merge/update might be needed depending on exact requirements + data_to_save = pd.concat([existing_df, data]).drop_duplicates(subset=['repo_artifact_id'], keep='last') # Assuming repo_artifact_id is unique key + else: + data_to_save = data + data_to_save.to_parquet(self.summarized_parquet_path, index=False) + print(f"Summaries data saved to {self.summarized_parquet_path}") + + def save_categories_data(self, data: pd.DataFrame, persona_name: str): + """Save categories data for a specific persona""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + data.to_parquet(persona_file, index=False) + print(f"Categories data for persona {persona_name} saved to {persona_file}") + + def save_consolidated_data(self, data: pd.DataFrame): + """Save consolidated data to Parquet and CSV""" + data.to_parquet(self.final_parquet_path, index=False) + print(f"Consolidated Parquet data saved to {self.final_parquet_path}") + data.to_csv(self.consolidated_csv_path, index=False) + print(f"Consolidated CSV data saved to {self.consolidated_csv_path}") + + def wipe_repos_data(self): + """Wipe repository data""" + if self.raw_parquet_path.exists(): + self.raw_parquet_path.unlink() + print(f"Wiped repository data: {self.raw_parquet_path}") + + def wipe_summaries_data(self): + """Wipe summaries data""" + if self.summarized_parquet_path.exists(): + self.summarized_parquet_path.unlink() + print(f"Wiped summaries data: {self.summarized_parquet_path}") + + def wipe_categories_data(self, persona_name: str = None): + """Wipe categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / 
f"{persona_name}.parquet" + if persona_file.exists(): + persona_file.unlink() + print(f"Wiped categories data for persona {persona_name}: {persona_file}") + else: + if self.categorized_dir.exists(): + shutil.rmtree(self.categorized_dir) + self.categorized_dir.mkdir(parents=True, exist_ok=True) # Recreate after wiping + print(f"Wiped all categories data in {self.categorized_dir}") + + def has_categories_for_persona(self, persona_name: str) -> bool: + """Check if category data exists for a specific persona.""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + return persona_file.exists() + + def get_final_parquet_path(self) -> Path: + return self.final_parquet_path + + def get_consolidated_csv_path(self) -> Path: + return self.consolidated_csv_path + + # New methods for unified data structure + + def save_unified_data(self, data: pd.DataFrame): + """ + Save unified repository data to Parquet and CSV. + This data includes all repositories, summaries, and categorizations in a single structure. + """ + # Ensure categorizations column is properly serialized for Parquet + if 'categorizations' in data.columns: + # Convert categorizations to strings for storage + # This is necessary because Parquet doesn't handle complex nested structures well + data_copy = data.copy() + data_copy['categorizations_json'] = data_copy['categorizations'].apply( + lambda x: json.dumps(x) if isinstance(x, list) else '[]' + ) + + # Save to Parquet (without the original categorizations column) + parquet_data = data_copy.drop(columns=['categorizations']) + parquet_data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Save to CSV for easier viewing (also without the complex column) + csv_data = parquet_data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + # Truncate other potentially long text fields + for col in ['categorizations_json']: + if col in csv_data.columns: + csv_data[col] = csv_data[col].apply( + lambda x: (x[:50] + '...') if isinstance(x, str) and len(x) > 50 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + else: + # If no categorizations column, save as is + data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Create a readable CSV version + csv_data = data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + + def get_unified_data(self) -> pd.DataFrame: + """ + Get the unified repository data with properly deserialized categorizations. 
+ """ + if not self.unified_parquet_path.exists(): + return pd.DataFrame() + + # Load the data from Parquet + data = pd.read_parquet(self.unified_parquet_path) + + # Deserialize the categorizations from JSON if present + if 'categorizations_json' in data.columns: + data['categorizations'] = data['categorizations_json'].apply( + lambda x: json.loads(x) if isinstance(x, str) else [] + ) + data = data.drop(columns=['categorizations_json']) + + return data + + def append_unified_data(self, new_repo_data: pd.DataFrame) -> None: + """ + Append a single repository or multiple repositories to the existing unified data. + + Args: + new_repo_data: DataFrame containing the new repository data to append + """ + if new_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the new data + self.save_unified_data(new_repo_data) + return + + # Combine existing and new data + combined_data = pd.concat([existing_data, new_repo_data], ignore_index=True) + + # Remove duplicates based on repo_artifact_id, keeping the newest version + combined_data = combined_data.sort_values('processing_timestamp', ascending=False) + combined_data = combined_data.drop_duplicates(subset=['repo_artifact_id'], keep='first') + + # Save the combined data + self.save_unified_data(combined_data) + + def update_unified_data(self, updated_repo_data: pd.DataFrame) -> None: + """ + Update specific repositories in the existing unified data. + + Args: + updated_repo_data: DataFrame containing the updated repository data + """ + if updated_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the updated data + self.save_unified_data(updated_repo_data) + return + + # Get the repo_artifact_ids of the updated repositories + updated_ids = set(updated_repo_data['repo_artifact_id']) + + # Remove the repositories that are being updated from the existing data + filtered_existing = existing_data[~existing_data['repo_artifact_id'].isin(updated_ids)] + + # Combine the filtered existing data with the updated data + combined_data = pd.concat([filtered_existing, updated_repo_data], ignore_index=True) + + # Save the combined data + self.save_unified_data(combined_data) + + def wipe_unified_data(self): + """Wipe unified data files""" + if self.unified_parquet_path.exists(): + self.unified_parquet_path.unlink() + print(f"Wiped unified data: {self.unified_parquet_path}") + if self.unified_csv_path.exists(): + self.unified_csv_path.unlink() + print(f"Wiped unified CSV data: {self.unified_csv_path}") + + def get_checkpoint_path(self) -> Path: + """Get the path to the processing checkpoint file""" + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + return local_output_dir / "processing_checkpoint.json" + + def save_checkpoint(self, checkpoint_data: Dict[str, Any]) -> None: + """ + Save the processing checkpoint data to a JSON file. + + Args: + checkpoint_data: Dictionary containing checkpoint information + """ + checkpoint_path = self.get_checkpoint_path() + with open(checkpoint_path, 'w') as f: + json.dump(checkpoint_data, f, indent=2) + + def load_checkpoint(self) -> Dict[str, Any]: + """ + Load the processing checkpoint data from a JSON file. 
+ + Returns: + Dictionary containing checkpoint information, or empty dict if no checkpoint exists + """ + checkpoint_path = self.get_checkpoint_path() + if not checkpoint_path.exists(): + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + + try: + with open(checkpoint_path, 'r') as f: + return json.load(f) + except Exception as e: + print(f"Error loading checkpoint: {e}") + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py new file mode 100644 index 00000000..fc5f398f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py @@ -0,0 +1,125 @@ +import pandas as pd +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.fetcher import DataFetcher + +class RepositoryFetcherStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + self.fetcher = DataFetcher() # Assuming DataFetcher doesn't need config for initialization + + def run(self, force_refresh: bool = False, fetch_new_only: bool = False): + """ + Fetch repositories and READMEs. + Uses test_mode and test_mode_limit from config if test_mode is enabled. + + Args: + force_refresh: If True, wipe existing data and fetch everything fresh + fetch_new_only: If True, only fetch repositories that don't exist in current data + """ + limit = None + sort_by_stars_in_test = False + is_test = self.config_manager.is_test_mode() + + if is_test: + limit = self.config_manager.get_test_mode_limit() + sort_by_stars_in_test = True # Always sort by stars in test mode as per new req + print(f"Running in TEST MODE: Targeting up to {limit} repositories, sorted by stars DESC.") + + if force_refresh: + print("Force refresh enabled for repository data. Wiping existing raw data.") + self.data_manager.wipe_repos_data() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_repos_data() + if not existing_df.empty: + if fetch_new_only: + print("Fetching only new repositories while keeping existing ones...") + else: + print("Repository data already exists and force_refresh is false.") + if is_test: + if 'star_count' in existing_df.columns: + print(f"Applying test mode (sort by stars, limit {limit}) to existing data.") + sorted_df = existing_df.sort_values(by='star_count', ascending=False) + return sorted_df.head(limit) + else: + print(f"Warning: 'star_count' not in existing data. Using first {limit} entries for test mode.") + return existing_df.head(limit) + return existing_df # Not test mode, return all existing + + # If here, either force_refresh is true or data doesn't exist. 
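# Descriptive note (not itself part of the patch): fetch_repositories() is assumed to
# return one row per repository with at least 'repo_artifact_namespace',
# 'repo_artifact_name' and 'star_count'; get_all_readmes() then adds a 'readme_md'
# column (empty string when no README could be retrieved). The code below keys on these
# columns, e.g. deduplicating on the (namespace, name) pair and sorting by star_count
# in test mode.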
+ print("Fetching repositories from OSO...") + # Pass sort_by_stars only if in test_mode, limit is passed anyway (None if not test) + new_repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=sort_by_stars_in_test) + + if new_repos_df.empty: + print("No repositories found from OSO fetch.") + # Save an empty DataFrame to indicate the step ran + self.data_manager.save_repos_data(pd.DataFrame()) + return pd.DataFrame() + + print(f"Found {len(new_repos_df)} repositories from OSO.") + + if fetch_new_only and not existing_df.empty: + # Filter out repositories that already exist + existing_repos = set(zip(existing_df['repo_artifact_namespace'], existing_df['repo_artifact_name'])) + new_repos_df = new_repos_df[~new_repos_df.apply( + lambda x: (x['repo_artifact_namespace'], x['repo_artifact_name']) in existing_repos, + axis=1 + )] + print(f"Found {len(new_repos_df)} new repositories to process.") + + if new_repos_df.empty: + print("No new repositories to process.") + return existing_df + + print("Fetching READMEs from GitHub...") + # Ensure 'repo_artifact_namespace' and 'repo_artifact_name' exist + if 'repo_artifact_namespace' not in new_repos_df.columns or 'repo_artifact_name' not in new_repos_df.columns: + print("Error: 'repo_artifact_namespace' or 'repo_artifact_name' not in fetched data.") + # Save what we have so far + self.data_manager.save_repos_data(new_repos_df) + return new_repos_df # Or handle error more gracefully + + new_repos_df = self.fetcher.get_all_readmes(new_repos_df) + print(f"Retrieved READMEs for {len(new_repos_df[new_repos_df['readme_md'] != ''])} repositories.") + + # Combine existing and new data + if not existing_df.empty: + combined_df = pd.concat([existing_df, new_repos_df], ignore_index=True) + # Remove any duplicates that might have been introduced + combined_df = combined_df.drop_duplicates( + subset=['repo_artifact_namespace', 'repo_artifact_name'], + keep='first' + ) + print(f"Combined data now contains {len(combined_df)} repositories.") + self.data_manager.save_repos_data(combined_df) + + # If in test mode and combined data exceeds limit + if limit is not None and len(combined_df) > limit: + if 'star_count' in combined_df.columns: + return combined_df.sort_values(by='star_count', ascending=False).head(limit) + return combined_df.head(limit) + return combined_df + else: + self.data_manager.save_repos_data(new_repos_df) + # If in test mode and fetched more than limit + if limit is not None and len(new_repos_df) > limit: + return new_repos_df.head(limit) + return new_repos_df + +if __name__ == '__main__': + # Example Usage (requires .env file and OSO/GitHub credentials) + # Ensure pipeline_config.json exists or is created with defaults + cfg_manager = ConfigManager() + + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + repo_fetch_step = RepositoryFetcherStep(data_manager=dt_manager, config_manager=cfg_manager) + + print("\nRunning RepositoryFetcherStep...") + fetched_data = repo_fetch_step.run(force_refresh=False) # Set True to wipe and refetch + print(f"\nFetched data head:\n{fetched_data.head()}") + print(f"Number of rows fetched: {len(fetched_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py new file mode 100644 index 00000000..f09b0d78 --- /dev/null +++ 
b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py @@ -0,0 +1,131 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput + +class SummaryGeneratorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, new_only: bool = False): + """ + Generate summaries for repositories. + Uses batch_size_summaries from config. + + Args: + force_refresh: If True, wipe existing summaries and regenerate all + new_only: If True, only generate summaries for repositories that don't have them yet + """ + batch_size = self.config_manager.get_batch_size_summaries() + + if force_refresh: + print("Force refresh enabled for summaries. Wiping existing summarized data.") + self.data_manager.wipe_summaries_data() + existing_summaries_df = pd.DataFrame() + else: + existing_summaries_df = self.data_manager.get_summaries_data() + + # Get repository data + repos_df = self.data_manager.get_repos_data() + if repos_df.empty: + print("No repository data found to generate summaries. Skipping.") + # Save an empty DataFrame to indicate the step ran if forced + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + # If we have existing summaries and not forcing refresh + if not existing_summaries_df.empty and not force_refresh: + if new_only: + # Filter out repositories that already have summaries + existing_repos = set(existing_summaries_df['repo_artifact_id']) + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(existing_repos)] + if repos_to_process.empty: + print("No new repositories found to generate summaries for.") + return existing_summaries_df + print(f"Found {len(repos_to_process)} new repositories to generate summaries for.") + else: + print("Summarized data already exists and force_refresh is false. Skipping summary generation.") + return existing_summaries_df + else: + repos_to_process = repos_df + + # Ensure 'readme_md' and 'repo_artifact_id' columns exist + if 'readme_md' not in repos_to_process.columns: + print("Error: 'readme_md' column not found in repository data. Cannot generate summaries.") + return pd.DataFrame() + if 'repo_artifact_id' not in repos_to_process.columns: + print("Error: 'repo_artifact_id' column not found. 
This ID is crucial.") + return pd.DataFrame() + + print(f"Generating summaries for {len(repos_to_process)} repositories in batches of {batch_size}...") + + all_summaries_data = [] # To collect all rows with new summaries + + # Process in batches + for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc="Generating Summaries"): + end_idx = min(start_idx + batch_size, len(repos_to_process)) + batch_df_initial = repos_to_process.iloc[start_idx:end_idx] + + # Create a working copy for this batch to add summaries + batch_df_processed = batch_df_initial.copy() + + summaries = [] + for _, row in batch_df_initial.iterrows(): + readme_content = row.get('readme_md', "") + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + summaries.append(summary_output.summary) + + batch_df_processed["summary"] = summaries + all_summaries_data.append(batch_df_processed) + + if not all_summaries_data: + print("No summaries were generated.") + # Save an empty DataFrame if no summaries were made but the step was intended to run + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + new_summaries_df = pd.concat(all_summaries_data, ignore_index=True) + + # If we have existing summaries and not forcing refresh, combine with new ones + if not existing_summaries_df.empty and not force_refresh: + final_summarized_df = pd.concat([existing_summaries_df, new_summaries_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_summarized_df = final_summarized_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new summary if there was a duplicate + ) + print(f"Combined data now contains {len(final_summarized_df)} repositories with summaries.") + else: + final_summarized_df = new_summaries_df + + self.data_manager.save_summaries_data(final_summarized_df) + + return final_summarized_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure repo data exists (run repo_fetcher.py example first if needed) + if dt_manager.get_repos_data().empty: + print("No repository data found. 
Please run RepositoryFetcherStep first or ensure data exists.") + else: + summary_gen_step = SummaryGeneratorStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning SummaryGeneratorStep...") + # Set force_refresh=True to regenerate even if file exists + summarized_data = summary_gen_step.run(force_refresh=False) + print(f"\nSummarized data head:\n{summarized_data.head()}") + print(f"Number of rows with summaries: {len(summarized_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py new file mode 100644 index 00000000..d669fa76 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py @@ -0,0 +1,432 @@ +import pandas as pd +import datetime +import json +import time +from typing import List, Dict, Any, Optional, Set +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput, ClassificationOutput +from ..processing.fetcher import DataFetcher + +class UnifiedProcessor: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + self.fetcher = DataFetcher() + + def run(self, + force_refresh: bool = False, + include_forks: bool = False, + inactive_repos: bool = False, + limit: Optional[int] = None): + """ + Unified processing pipeline that fetches repositories, READMEs, generates summaries, + and categorizes them in a single pass. + + Args: + force_refresh: If True, wipe existing data and process everything fresh + include_forks: If True, include forked repositories in processing + inactive_repos: If True, include repositories not updated in the last year + limit: Optional limit on number of repositories to process + """ + # Get test mode settings + is_test = self.config_manager.is_test_mode() + if is_test: + test_limit = self.config_manager.get_test_mode_limit() + if limit is None or limit > test_limit: + limit = test_limit + print(f"Running in TEST MODE: Limiting to {limit} repositories, sorted by stars DESC.") + + # Determine batch sizes + batch_size = min( + self.config_manager.get_batch_size_summaries(), + self.config_manager.get_batch_size_categorization() + ) + + # Load checkpoint or initialize a new one + if force_refresh: + print("Force refresh enabled. Wiping existing data and checkpoint.") + self.data_manager.wipe_unified_data() + self._initialize_checkpoint() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_unified_data() + if not existing_df.empty: + print(f"Found existing data with {len(existing_df)} repositories.") + + # Fetch repositories from OSO + print("Fetching repositories from OSO...") + repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=True) + + if repos_df.empty: + print("No repositories found from OSO fetch.") + return pd.DataFrame() + + print(f"Found {len(repos_df)} repositories from OSO.") + + # Filter repositories based on parameters + if not include_forks: + repos_df = repos_df[~repos_df['is_fork']] + print(f"Filtered out forks. {len(repos_df)} repositories remaining.") + + if not inactive_repos: + repos_df = repos_df[repos_df['is_actively_maintained']] + print(f"Filtered out inactive repositories. 
{len(repos_df)} repositories remaining.") + + # Load checkpoint to determine which repositories need processing + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + + # Determine which repositories need processing + if not force_refresh: + # Filter out already processed repositories + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(processed_repos)] + print(f"Found {len(repos_to_process)} repositories that need processing.") + + # Process the repositories + processed_df = self._process_repositories(repos_to_process, batch_size) + + # Return the combined data (existing + newly processed) + return self.data_manager.get_unified_data() + else: + # Process all repositories + processed_df = self._process_repositories(repos_df, batch_size) + return self.data_manager.get_unified_data() + + def _initialize_checkpoint(self): + """Initialize a new checkpoint file""" + checkpoint = { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + self.data_manager.save_checkpoint(checkpoint) + print("Initialized new processing checkpoint.") + + def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.DataFrame: + """ + Process repositories in batches: fetch READMEs, generate summaries, and categorize. + + Args: + repos_df: DataFrame containing repositories to process + batch_size: Number of repositories to process in each batch + + Returns: + DataFrame with processed repositories + """ + print(f"Processing {len(repos_df)} repositories in batches of {batch_size}...") + + # Get personas for categorization + personas = self.config_manager.get_personas() + if not personas: + print("No personas found for categorization.") + return repos_df + + # Load checkpoint + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + partial_results = checkpoint.get("partial_results", {}) + + # Process in batches + all_processed_data = [] + + for start_idx in tqdm(range(0, len(repos_df), batch_size), desc="Processing Repositories"): + end_idx = min(start_idx + batch_size, len(repos_df)) + batch_df = repos_df.iloc[start_idx:end_idx].copy() + + # Process each repository in the batch + for idx, row in tqdm(batch_df.iterrows(), desc="Processing repositories in batch", total=len(batch_df), leave=False): + repo_id = row.get('repo_artifact_id') + repo_name = row.get('repo_artifact_name', 'repo') + + # Skip if already fully processed + if repo_id in processed_repos: + print(f"Skipping {repo_name} (already processed)") + continue + + # Get partial progress for this repository + partial = partial_results.get(repo_id, {}) + + # Initialize repository data + repo_data = row.to_dict() + repo_data['categorizations'] = [] + repo_data['final_recommendation'] = 'UNCATEGORIZED' + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + repo_data['summary'] = '' + + # Fetch README if needed + if not partial.get('readme_fetched', False): + try: + print(f"Fetching README for {repo_name}...") + readme_content, readme_status = self.fetcher.fetch_readme( + repo_data['repo_artifact_namespace'], + repo_data['repo_artifact_name'] + ) + repo_data['readme_md'] = readme_content + repo_data['readme_status'] = readme_status + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = repo_data['readme_status'] + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + 
self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error fetching README for {repo_name}: {e}") + repo_data['readme_md'] = '' + repo_data['readme_status'] = 'ERROR' + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = 'ERROR' + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + else: + # Use README status from checkpoint + repo_data['readme_status'] = partial.get('readme_status', 'ERROR') + + # Generate summary if needed + if not partial.get('summary_generated', False) and repo_data['readme_status'] == 'SUCCESS': + try: + print(f"Generating summary for {repo_name}...") + readme_content = repo_data.get('readme_md', '') + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + repo_data['summary'] = summary_output.summary + + # Update checkpoint + partial['summary_generated'] = True + partial['summary'] = summary_output.summary + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error generating summary for {repo_name}: {e}") + repo_data['summary'] = '' + + # Update checkpoint + partial['summary_generated'] = True # Mark as attempted + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + elif partial.get('summary_generated', False) and 'summary' in partial: + # Use summary from checkpoint + repo_data['summary'] = partial.get('summary', '') + + # Initialize personas completed + if 'personas_completed' not in partial: + partial['personas_completed'] = [] + + # Initialize categorizations + categorizations = [] + + # Categorize with each persona if README is available + if repo_data['readme_status'] == 'SUCCESS': + for persona in tqdm(personas, desc=f"Categorizing {repo_name} with personas", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + + try: + # Prepare project data for categorization + project_data = { + 'summary': repo_data['summary'], + 'repo_artifact_id': repo_id, + 'star_count': repo_data.get('star_count', 0), + 'fork_count': repo_data.get('fork_count', 0), + 'created_at': repo_data.get('created_at'), + 'updated_at': repo_data.get('updated_at') + } + + # Get categorization from this persona + classifications = self.ai_service.classify_projects_batch_for_persona( + [project_data], + persona + ) + + if classifications and len(classifications) > 0: + classification = classifications[0] + cat_entry = { + 'persona_name': persona['name'], + 'category': classification.assigned_tag, + 'reason': classification.reason, + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + else: + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': 
'Failed to get classification from AI service', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error categorizing {repo_name} with persona {persona['name']}: {e}") + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'Error: {str(e)}', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add a small delay to avoid rate limiting + time.sleep(0.1) + else: + # If README is empty or error, mark all categorizations as UNCATEGORIZED + for persona in tqdm(personas, desc=f"Marking {repo_name} as UNCATEGORIZED", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'README {repo_data["readme_status"]}', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Determine final recommendation based on categorizations + final_recommendation = self._determine_final_recommendation(categorizations, repo_data.get('star_count', 0)) + + # Update the repository data + repo_data['categorizations'] = categorizations + repo_data['final_recommendation'] = final_recommendation + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + + # Create a DataFrame for this repository + repo_df = pd.DataFrame([repo_data]) + + # Save this repository to the unified data + self.data_manager.append_unified_data(repo_df) + + # Mark as fully processed + processed_repos.add(repo_id) + checkpoint['processed_repos'] = list(processed_repos) + checkpoint['last_processed_repo_id'] = repo_id + + # Remove from partial results to save space + if repo_id in partial_results: + del partial_results[repo_id] + + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add to processed data + all_processed_data.append(repo_df) + + if not all_processed_data: + print("No data was processed.") + return pd.DataFrame() + + return pd.concat(all_processed_data, ignore_index=True) if all_processed_data else pd.DataFrame() + + def _determine_final_recommendation(self, categorizations: List[Dict[str, Any]], star_count: int) -> 
str: + """ + Determine the final recommendation based on categorizations from all personas. + + Args: + categorizations: List of categorization dictionaries + star_count: Star count of the repository (for potential future weighting) + + Returns: + Final category recommendation + """ + # Filter out UNCATEGORIZED entries + valid_categories = [c['category'] for c in categorizations if c['category'] != 'UNCATEGORIZED'] + + if not valid_categories: + return 'UNCATEGORIZED' + + # Count occurrences of each category + category_counts = {} + for category in valid_categories: + category_counts[category] = category_counts.get(category, 0) + 1 + + # Find the most common category + max_count = 0 + final_category = 'UNCATEGORIZED' + + for category, count in category_counts.items(): + if count > max_count: + max_count = count + final_category = category + + return final_category + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + processor = UnifiedProcessor( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning UnifiedProcessor...") + processed_data = processor.run( + force_refresh=False, + include_forks=False, + inactive_repos=False + ) + + if not processed_data.empty: + print(f"\nProcessed data head:\n{processed_data.head()}") + print(f"Number of rows processed: {len(processed_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py new file mode 100644 index 00000000..e1ebbcd8 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py @@ -0,0 +1,10 @@ +from .fetcher import DataFetcher +from .ai_service import AIService + +# The old Summarizer class has been effectively replaced by AIService +# and the pipeline steps (SummaryGeneratorStep, CategorizerStep). 
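+#
+# A minimal usage sketch (an illustration only, assuming a valid GEMINI_API_KEY and
+# the default pipeline_config.json; see the __main__ blocks in the pipeline modules):
+#
+#   from ..config.config_manager import ConfigManager
+#   cfg = ConfigManager()
+#   ai = AIService(config_manager=cfg)
+#   summary_text = ai.make_summary(readme_md).summary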
+ +__all__ = [ + "DataFetcher", + "AIService", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py new file mode 100644 index 00000000..074767dd --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py @@ -0,0 +1,263 @@ +import json +import pandas as pd +import time +from dataclasses import dataclass, asdict +from typing import List, Type, TypeVar, Union, Dict, Any +import google.generativeai as genai +from ..config.config_manager import ConfigManager + +# Define generic type for output classes +T = TypeVar( + 'T', + bound=Union['SummaryOutput', 'TagsOutput', 'ClassificationOutput', 'BatchClassificationOutput'] +) + +@dataclass +class SummaryOutput: + summary: str + +@dataclass +class TagsOutput: + tags: List[str] + +@dataclass +class ClassificationOutput: + assigned_tag: str + reason: str + +@dataclass +class BatchClassificationOutput: + classifications: List[ClassificationOutput] + + +class AIService: + def __init__(self, config_manager: ConfigManager): + self.config_manager = config_manager + self.api_key = self.config_manager.get_gemini_api_key() # Use specific getter + self.model_name = self.config_manager.get("gemini_model") # Model name can stay in JSON config + + if not self.api_key: + raise ValueError("GEMINI_API_KEY not found in configuration.") + if not self.model_name: + raise ValueError("GEMINI_MODEL not found in configuration.") + + genai.configure(api_key=self.api_key) + self.model = genai.GenerativeModel(self.model_name) + self.request_count = 0 + self.start_time = time.time() + + def _rate_limit_control(self): + """Basic rate limiting: 60 requests per minute for flash models.""" + self.request_count += 1 + elapsed_time = time.time() - self.start_time + if elapsed_time < 60 and self.request_count > 55: # Slight safety margin + sleep_time = 60 - elapsed_time + print(f"Rate limit approaching. 
Sleeping for {sleep_time:.2f} seconds.") + time.sleep(sleep_time) + self.request_count = 0 + self.start_time = time.time() + elif elapsed_time >= 60: + self.request_count = 0 + self.start_time = time.time() + + + def execute_query(self, prompt: str, output_class: Type[T]) -> T: + """Execute a query against the Gemini API and parse the response.""" + self._rate_limit_control() + print(f"\nSending prompt to Gemini (model: {self.model_name})...") + + try: + response = self.model.generate_content(prompt) + except Exception as e: + print(f"Error calling Gemini API: {e}") + # Fallback for errors + if output_class is SummaryOutput: + return SummaryOutput(summary="Error generating summary.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Error", reason="API call failed.") + if output_class is BatchClassificationOutput: + return BatchClassificationOutput(classifications=[]) + raise + + try: + text = response.text.strip() + # Try to find JSON block, robustly + json_str = None + if output_class is BatchClassificationOutput: # Expects a list + start_brace = text.find("[") + end_brace = text.rfind("]") + 1 # Add 1 to include the closing bracket + else: # Expects an object + start_brace = text.find("{") + end_brace = text.rfind("}") + 1 # Add 1 to include the closing brace + + if start_brace != -1 and end_brace > start_brace: + json_str = text[start_brace:end_brace] + data = json.loads(json_str) + else: + print("No valid JSON found in response.") + raise ValueError("No JSON object/array found in response") + + if output_class is SummaryOutput: + return SummaryOutput(summary=data.get("summary", "Summary not found in response.")) + if output_class is TagsOutput: + return TagsOutput(tags=data.get("tags", [])) + if output_class is ClassificationOutput: # For single classification + return ClassificationOutput( + assigned_tag=data.get("assigned_tag", "Other"), + reason=data.get("reason", "Could not classify project from response.") + ) + if output_class is BatchClassificationOutput: # For batch classification + classifications_data = data # data is already the list + parsed_classifications = [ + ClassificationOutput( + assigned_tag=item.get("assigned_tag", "Other"), + reason=item.get("reason", "Could not classify.") + ) for item in classifications_data + ] + return BatchClassificationOutput(classifications=parsed_classifications) + + raise ValueError(f"Unknown output class: {output_class}") + + except (json.JSONDecodeError, ValueError) as e: + print(f"Error processing Gemini response: {e}. 
Raw text: '{response.text[:300]}...'") + if output_class is SummaryOutput: + return SummaryOutput(summary="Failed to parse summary from response.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Other", reason="Failed to parse classification.") + if output_class is BatchClassificationOutput: + # Return empty list of classifications for the batch + return BatchClassificationOutput(classifications=[]) + raise + + def make_summary(self, readme_md: str) -> SummaryOutput: + """Generate a summary of the project based on its README.""" + if not readme_md or not readme_md.strip(): + return SummaryOutput(summary="This appears to be an empty repository without a README file.") + + prompt_template = self.config_manager.get_summary_prompt_template() + prompt = prompt_template.format(readme_md=readme_md) + return self.execute_query(prompt, SummaryOutput) + + def make_tags(self, summary: str) -> TagsOutput: + """Generate tags for the project based on its summary.""" + if not summary or "empty repository" in summary.lower() or "error generating summary" in summary.lower(): + return TagsOutput(tags=[]) + + prompt_template = self.config_manager.get_tags_prompt_template() + prompt = prompt_template.format(summary=summary) + return self.execute_query(prompt, TagsOutput) + + def classify_projects_batch_for_persona( + self, + project_data_batch: List[Dict[str, Any]], # Changed from summaries: List[str] + persona: Dict[str, Any] + ) -> List[ClassificationOutput]: + """ + Classify multiple projects at once for a specific persona using their summaries and metadata. + Each item in project_data_batch is a dict with 'summary', 'star_count', etc. + The persona dictionary should contain 'name', 'title', 'description', and 'prompt' (template). + """ + if not project_data_batch: + return [] + + categories_list_str = "\n".join( + f"- \"{c['category']}\": {c['description']}" # Ensure category names are quoted for clarity in prompt + for c in self.config_manager.get_categories() + ) + + persona_prompt_template = persona.get('prompt') + if not persona_prompt_template: + print(f"Error: Persona '{persona.get('name')}' is missing a prompt template.") + return [ClassificationOutput(assigned_tag="Error", reason="Persona prompt missing")] * len(project_data_batch) + + individual_project_prompts = [] + for i, project_data in enumerate(project_data_batch): + # Prepare metadata for formatting, handling None or NaN + # Ensure star_count and fork_count are numbers, default to 0 if None/NaN + star_count = project_data.get('star_count') + fork_count = project_data.get('fork_count') + + formatted_star_count = int(star_count) if pd.notna(star_count) else 0 + formatted_fork_count = int(fork_count) if pd.notna(fork_count) else 0 + + # Format dates, default to "N/A" if None/NaT + created_at = project_data.get('created_at') + updated_at = project_data.get('updated_at') + + formatted_created_at = str(created_at.date()) if pd.notna(created_at) and hasattr(created_at, 'date') else "N/A" + formatted_updated_at = str(updated_at.date()) if pd.notna(updated_at) and hasattr(updated_at, 'date') else "N/A" + + # Ensure summary is a string + summary_text = project_data.get('summary', "No summary provided.") + if not isinstance(summary_text, str): + summary_text = str(summary_text) + + + try: + # The persona_prompt_template itself contains the persona's role description. + # We just need to format it with the project-specific data. 
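+                # Placeholders the template is expected to expose: {summary}, {star_count},
+                # {fork_count}, {created_at}, {updated_at}, and {categories}.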
+ # The {categories} placeholder in the persona prompt will be filled by this categories_list_str. + formatted_project_section = persona_prompt_template.format( + summary=summary_text, + star_count=formatted_star_count, + fork_count=formatted_fork_count, + created_at=formatted_created_at, + updated_at=formatted_updated_at, + categories=categories_list_str # Pass the formatted list of categories + ) + individual_project_prompts.append(f"--- Project {i+1} ---\n{formatted_project_section}") + except KeyError as e: + print(f"KeyError during prompt formatting for persona {persona.get('name')}, project {project_data.get('repo_artifact_id', 'Unknown')}: {e}") + # Add a placeholder error entry for this project + individual_project_prompts.append(f"--- Project {i+1} ---\nError formatting prompt for this project. Cannot classify.") + + + batch_project_details_str = "\n\n".join(individual_project_prompts) + + # Construct the overall batch prompt + # The persona's title and description can frame the overall task. + persona_title = persona.get('title', persona['name']) + persona_description = persona.get('description', '') + + final_batch_prompt = f"""As {persona_title} ({persona_description}), your task is to review and classify the following {len(project_data_batch)} project(s). +For each project, use the specific instructions and context provided under its section. + +{batch_project_details_str} + +After reviewing all projects, please respond with a single JSON array. Each element in the array should be a JSON object corresponding to one project, in the exact order they were presented above. Each object must contain: +1. "assigned_tag": The category you assigned from the provided list. +2. "reason": A brief explanation for your choice, following the persona's specific instructions. + +Example for two projects: +[ + {{ "assigned_tag": "Category A", "reason": "Reason for project 1..." }}, + {{ "assigned_tag": "Category B", "reason": "Reason for project 2..." 
}} +] +""" + + batch_output = self.execute_query(final_batch_prompt, BatchClassificationOutput) + + # Ensure the number of classifications matches the number of projects + if len(batch_output.classifications) != len(project_data_batch): + print(f"Warning: Mismatch in number of projects ({len(project_data_batch)}) and classifications ({len(batch_output.classifications)}) for persona {persona['name']}.") + error_classification = ClassificationOutput(assigned_tag="Error", reason="Mismatch in batch processing output length") + # Adjust the length of classifications to match project_data_batch + final_classifications = batch_output.classifications[:len(project_data_batch)] + while len(final_classifications) < len(project_data_batch): + final_classifications.append(error_classification) + batch_output.classifications = final_classifications + + return batch_output.classifications + + +if __name__ == '__main__': + # Example Usage + # Example Usage: + # cfg_manager = ConfigManager() + # ai_service = AIService(config_manager=cfg_manager) + # print("AIService initialized for standalone testing if needed.") + pass diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py new file mode 100644 index 00000000..36bf25ac --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py @@ -0,0 +1,140 @@ +import base64 +import requests +import pandas as pd +import datetime +from pyoso import Client +from ..config.settings import OSO_API_KEY, GITHUB_HEADERS + +class DataFetcher: + def __init__(self): + self.oso_client = Client(api_key=OSO_API_KEY) + + def fetch_repositories(self, limit: int = None, sort_by_stars: bool = True) -> pd.DataFrame: + """ + Fetch repositories from OSO. + + Args: + limit: Optional limit on number of repositories to fetch. + sort_by_stars: If True, sort repositories by star_count descending. + """ + + where_keywords = """ + collection_name LIKE '%ethereum%' + OR collection_name LIKE '%arbitrum%' + OR collection_name LIKE '%optimism%' + OR collection_name LIKE '%scroll%' + OR collection_name LIKE '%polygon%' + """ + query = f""" + SELECT DISTINCT + re.artifact_id AS repo_artifact_id, + p.project_id, + p.project_name, + p.display_name, + re.artifact_namespace AS repo_artifact_namespace, + re.artifact_name AS repo_artifact_name, + re.created_at, + re.updated_at, + re.star_count, + re.fork_count, + re.is_fork, + re.num_packages_in_deps_dev + FROM int_repositories_enriched AS re + JOIN projects_v1 AS p ON re.project_id = p.project_id + WHERE p.project_id IN ( + SELECT DISTINCT project_id FROM oso.projects_by_collection_v1 + WHERE {where_keywords} + ) + """ + # The table int_superchain_s7_devtooling_repositories should have star_count + # If not, this sort will fail or do nothing. Assuming 'r.star_count' is valid. 
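+        # (The query above selects FROM int_repositories_enriched AS re, so the sort key
+        # applied below is re.star_count.)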
+ if sort_by_stars: + query += " ORDER BY re.star_count DESC, p.project_name ASC" + + if limit is not None and isinstance(limit, int) and limit > 0: + query += f" LIMIT {limit}" + + df = self.oso_client.to_pandas(query) + + # Add is_actively_maintained field based on updated_at (active if updated in last year) + # Use naive datetime (no timezone) for comparison + one_year_ago = pd.Timestamp.now().tz_localize(None) - pd.Timedelta(days=365) + + # Convert updated_at to datetime if it's a string + def check_if_active(date): + if pd.isna(date): + return False + + # Convert to datetime if it's a string + if isinstance(date, str): + try: + date = pd.to_datetime(date) + except: + return False + + # Ensure datetime is naive (no timezone) for comparison + if hasattr(date, 'tz_localize'): + if date.tzinfo is not None: + date = date.tz_localize(None) + + # Now compare with one_year_ago + return date > one_year_ago + + df['is_actively_maintained'] = df['updated_at'].apply(check_if_active) + + # Ensure is_fork is a boolean + if 'is_fork' not in df.columns: + print("Warning: 'is_fork' field not available in OSO data. Setting all to False.") + df['is_fork'] = False + else: + # Convert to boolean if it's not already + df['is_fork'] = df['is_fork'].fillna(False).astype(bool) + + return df + + def fetch_readme(self, owner: str, repo: str) -> tuple: + """ + Fetch README.md content from GitHub repository with debug logging. + + Returns: + tuple: (readme_content, status) where status is one of: + "SUCCESS", "EMPTY", or "ERROR" + """ + url = f"https://api.github.com/repos/{owner}/{repo}/readme" + print(f"Fetching README for {owner}/{repo} ...", flush=True) + resp = requests.get(url, headers=GITHUB_HEADERS) + print(f"Status code: {resp.status_code}", flush=True) + if resp.status_code == 200: + data = resp.json() + try: + content = base64.b64decode(data["content"]).decode("utf-8") + if not content.strip(): + print(f"Empty README for {owner}/{repo}", flush=True) + return "", "EMPTY" + print(f"Successfully fetched README for {owner}/{repo}", flush=True) + return content, "SUCCESS" + except Exception as e: + print(f"Error decoding README for {owner}/{repo}: {e}", flush=True) + return "", "ERROR" + else: + print(f"Failed to fetch README for {owner}/{repo}: {resp.text}", flush=True) + return "", "ERROR" + + def get_all_readmes(self, df: pd.DataFrame) -> pd.DataFrame: + """Add README content to the dataframe for each repository with debug logging.""" + print("First 5 repo_artifact_namespace:", df["repo_artifact_namespace"].head().tolist(), flush=True) + print("First 5 repo_artifact_name:", df["repo_artifact_name"].head().tolist(), flush=True) + + # Apply fetch_readme and capture both content and status with progress bar + from tqdm import tqdm + tqdm.pandas(desc="Fetching READMEs") + readme_results = df.progress_apply( + lambda row: self.fetch_readme(row.repo_artifact_namespace, row.repo_artifact_name), + axis=1 + ) + + # Split the results into separate columns + df["readme_md"] = [result[0] for result in readme_results] + df["readme_status"] = [result[1] for result in readme_results] + + return df diff --git a/experiments/ethereum-repo-clusters/pipeline_config.json b/experiments/ethereum-repo-clusters/pipeline_config.json new file mode 100644 index 00000000..65dc68d3 --- /dev/null +++ b/experiments/ethereum-repo-clusters/pipeline_config.json @@ -0,0 +1,10 @@ +{ + "output_dir": "/Users/cerv1/Dropbox/Kariba/Github/insights/experiments/devtooling_labels/output", + "gemini_model": "gemini-2.0-flash", + 
"summary_prompt_template": "You are an analyst preparing short, neutral briefs on open-source projects. Read the README below and write a **concise, 2- to 3-sentence summary** that:\n\u2022 states the project\u2019s core purpose / problem it solves\n\u2022 lists its main capabilities or components (1\u20133 key points only)\n\u2022 mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n\u2022 notes any strongly signalled context such as supported programming language, network, or runtime\n\n**Style constraints**\n\u2022 Use plain, factual language in third person (no hype, no marketing adjectives).\n\u2022 **Do not** guess or invent details that are not explicit in the README.\n\u2022 **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n\u2022 Limit the summary to <100 words; avoid bullet lists or line breaks.\n\nReturn your answer as **exactly one valid JSON object** in this form (nothing extra):\n{{\n \"summary\": \"your summary here\"\n}}\n\nREADME:\n{readme_md}", + "tags_prompt_template": "Based on this project summary, generate a list of relevant tags that describe the project's purpose and functionality.\n\nYou must respond with a valid JSON object in this exact format:\n{{\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n}}\n\nSummary:\n{summary}", + "test_mode": false, + "test_mode_limit": 30, + "batch_size_summaries": 10, + "batch_size_categorization": 10 +} \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/requirements.txt b/experiments/ethereum-repo-clusters/requirements.txt new file mode 100644 index 00000000..7ab7b91e --- /dev/null +++ b/experiments/ethereum-repo-clusters/requirements.txt @@ -0,0 +1,8 @@ +pandas>=2.0.0 +requests>=2.31.0 +pyoso>=0.1.0 +google-generativeai>=0.3.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +click>=8.0.0 +pyarrow>=14.0.0 # For parquet support diff --git a/experiments/ethereum-repo-clusters/setup.py b/experiments/ethereum-repo-clusters/setup.py new file mode 100644 index 00000000..1b43d1c3 --- /dev/null +++ b/experiments/ethereum-repo-clusters/setup.py @@ -0,0 +1,16 @@ +from setuptools import setup, find_packages + +setup( + name="devtooling_labels", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "pandas>=2.0.0", + "requests>=2.31.0", + "pyoso>=0.1.0", + "google-generativeai>=0.3.0", + "pydantic>=2.0.0", + "python-dotenv>=1.0.0", + ], + python_requires=">=3.8", +) \ No newline at end of file diff --git a/tutorials/FundingMetrics.ipynb b/tutorials/FundingMetrics.ipynb new file mode 100644 index 00000000..d64c4876 --- /dev/null +++ b/tutorials/FundingMetrics.ipynb @@ -0,0 +1,1192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "41b714c9-c749-4d0d-ad59-71e0c035d325", + "metadata": {}, + "outputs": [], + "source": [ + "# ! 
pip install pyoso" + ] + }, + { + "cell_type": "markdown", + "id": "413d143d-4494-4812-8cae-d28f47cc397e", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Load environment variables, import necessary libraries, and initialize the OSO client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "988cd219-8b29-469d-9e7a-46f7a965ddc7", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "import pandas as pd\n", + "from pyoso import Client\n", + "\n", + "load_dotenv()\n", + "\n", + "OSO_API_KEY = os.environ['OSO_API_KEY']\n", + "client = Client(api_key=OSO_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "id": "4231b926-89ab-48de-8746-b0d10f44c470", + "metadata": {}, + "source": [ + "## Testing\n", + "\n", + "Query the metrics table for all metric names containing '_funding_' and display them in alphabetical order" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "77fe6763-7d49-47a5-b5c6-68264034ab0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_name
0GITCOIN_DONATIONS_funding_awarded_biannually
1GITCOIN_DONATIONS_funding_awarded_daily
2GITCOIN_DONATIONS_funding_awarded_monthly
3GITCOIN_DONATIONS_funding_awarded_over_all_time
4GITCOIN_DONATIONS_funding_awarded_quarterly
5GITCOIN_DONATIONS_funding_awarded_weekly
6GITCOIN_DONATIONS_funding_awarded_yearly
7GITCOIN_MATCHING_funding_awarded_biannually
8GITCOIN_MATCHING_funding_awarded_daily
9GITCOIN_MATCHING_funding_awarded_monthly
10GITCOIN_MATCHING_funding_awarded_over_all_time
11GITCOIN_MATCHING_funding_awarded_quarterly
12GITCOIN_MATCHING_funding_awarded_weekly
13GITCOIN_MATCHING_funding_awarded_yearly
14OPEN_COLLECTIVE_funding_received_biannually
15OPEN_COLLECTIVE_funding_received_daily
16OPEN_COLLECTIVE_funding_received_monthly
17OPEN_COLLECTIVE_funding_received_over_all_time
18OPEN_COLLECTIVE_funding_received_quarterly
19OPEN_COLLECTIVE_funding_received_weekly
20OPEN_COLLECTIVE_funding_received_yearly
21OSS_FUNDING_funding_awarded_biannually
22OSS_FUNDING_funding_awarded_daily
23OSS_FUNDING_funding_awarded_monthly
24OSS_FUNDING_funding_awarded_over_all_time
25OSS_FUNDING_funding_awarded_quarterly
26OSS_FUNDING_funding_awarded_weekly
27OSS_FUNDING_funding_awarded_yearly
\n", + "
" + ], + "text/plain": [ + " metric_name\n", + "0 GITCOIN_DONATIONS_funding_awarded_biannually\n", + "1 GITCOIN_DONATIONS_funding_awarded_daily\n", + "2 GITCOIN_DONATIONS_funding_awarded_monthly\n", + "3 GITCOIN_DONATIONS_funding_awarded_over_all_time\n", + "4 GITCOIN_DONATIONS_funding_awarded_quarterly\n", + "5 GITCOIN_DONATIONS_funding_awarded_weekly\n", + "6 GITCOIN_DONATIONS_funding_awarded_yearly\n", + "7 GITCOIN_MATCHING_funding_awarded_biannually\n", + "8 GITCOIN_MATCHING_funding_awarded_daily\n", + "9 GITCOIN_MATCHING_funding_awarded_monthly\n", + "10 GITCOIN_MATCHING_funding_awarded_over_all_time\n", + "11 GITCOIN_MATCHING_funding_awarded_quarterly\n", + "12 GITCOIN_MATCHING_funding_awarded_weekly\n", + "13 GITCOIN_MATCHING_funding_awarded_yearly\n", + "14 OPEN_COLLECTIVE_funding_received_biannually\n", + "15 OPEN_COLLECTIVE_funding_received_daily\n", + "16 OPEN_COLLECTIVE_funding_received_monthly\n", + "17 OPEN_COLLECTIVE_funding_received_over_all_time\n", + "18 OPEN_COLLECTIVE_funding_received_quarterly\n", + "19 OPEN_COLLECTIVE_funding_received_weekly\n", + "20 OPEN_COLLECTIVE_funding_received_yearly\n", + "21 OSS_FUNDING_funding_awarded_biannually\n", + "22 OSS_FUNDING_funding_awarded_daily\n", + "23 OSS_FUNDING_funding_awarded_monthly\n", + "24 OSS_FUNDING_funding_awarded_over_all_time\n", + "25 OSS_FUNDING_funding_awarded_quarterly\n", + "26 OSS_FUNDING_funding_awarded_weekly\n", + "27 OSS_FUNDING_funding_awarded_yearly" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT metric_name\n", + "FROM metrics_v0\n", + "WHERE metric_name LIKE '%_funding_%'\n", + "ORDER BY 1\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "e85f23d4-3605-40f6-8228-ce175de20f30", + "metadata": {}, + "source": [ + "## Aggregate funding metrics\n", + "\n", + "### By source\n", + "\n", + "We currently support CSV data uploads via [oss-funding](https://github.com/opensource-observer/oss-funding) and Gitcoin Grants. We also have Open Collective deposits, but they don't show up here (yet)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aa31b0d1-2117-4fb0-8b6e-f206500602d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_nametotal_amount_in_usd
0OSS_FUNDING_funding_awarded_over_all_time364887873.600968
1GITCOIN_MATCHING_funding_awarded_over_all_time13305117.158144
2GITCOIN_DONATIONS_funding_awarded_over_all_time11666103.711711
\n", + "
" + ], + "text/plain": [ + " metric_name total_amount_in_usd\n", + "0 OSS_FUNDING_funding_awarded_over_all_time 364887873.600968\n", + "1 GITCOIN_MATCHING_funding_awarded_over_all_time 13305117.158144\n", + "2 GITCOIN_DONATIONS_funding_awarded_over_all_time 11666103.711711" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " m.metric_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "WHERE m.metric_name LIKE '%_funding_%'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d2902b8-468b-44ae-bae9-d797ec18de0d", + "metadata": {}, + "source": [ + "### To projects\n", + "\n", + "We can also see the largest project recipients with this query." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9edc7a4-19d1-443c-bc01-ced2c6ffda65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0GMX21000000.0
1MUX Protocol10876479.0
2Synthetix10022628.074157
3Perpetual Protocol9287212.140718
4Gains Network7898396.135
5Velodrome7895037.76024
6Camelot5407500.0
7Stargate Finance5289458.865658
8Vertex Protocol5250000.0
9Radiant4991077.0
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 GMX 21000000.0\n", + "1 MUX Protocol 10876479.0\n", + "2 Synthetix 10022628.074157\n", + "3 Perpetual Protocol 9287212.140718\n", + "4 Gains Network 7898396.135\n", + "5 Velodrome 7895037.76024\n", + "6 Camelot 5407500.0\n", + "7 Stargate Finance 5289458.865658\n", + "8 Vertex Protocol 5250000.0\n", + "9 Radiant 4991077.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name LIKE '%_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0841ca11-505d-47bc-acdc-5ca8314af991", + "metadata": {}, + "source": [ + "### To projects from a specific source" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9783c7e0-c2e0-4f1c-9dda-42d458cb39ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin1099895.038376
1Revoke748859.365745
2DefiLlama429924.507285
3Hey360529.24178
4JediSwap333277.670918
5Dark Forest332205.420888
6ZigZag Exchange210175.931949
7ethers.js190702.539836
8rotki174990.340416
9Taho170854.869607
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 1099895.038376\n", + "1 Revoke 748859.365745\n", + "2 DefiLlama 429924.507285\n", + "3 Hey 360529.24178\n", + "4 JediSwap 333277.670918\n", + "5 Dark Forest 332205.420888\n", + "6 ZigZag Exchange 210175.931949\n", + "7 ethers.js 190702.539836\n", + "8 rotki 174990.340416\n", + "9 Taho 170854.869607" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "807ac8df-5c90-4c14-821b-d29ebab1089c", + "metadata": {}, + "source": [ + "### To projects from a specific source and time frame" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69f8d8ff-f902-4cce-bc3f-d23182f84de3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin797206.330376
1Dark Forest297517.115925
2ZigZag Exchange199746.433382
3ethers.js129500.966707
4Prysm Ethereum Client128522.705766
5rotki122666.927997
6ZeroPool116795.642612
7Lighthouse by Sigma Prime114759.839844
8The Tor Project110669.738113
9Hardhat110539.758225
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 797206.330376\n", + "1 Dark Forest 297517.115925\n", + "2 ZigZag Exchange 199746.433382\n", + "3 ethers.js 129500.966707\n", + "4 Prysm Ethereum Client 128522.705766\n", + "5 rotki 122666.927997\n", + "6 ZeroPool 116795.642612\n", + "7 Lighthouse by Sigma Prime 114759.839844\n", + "8 The Tor Project 110669.738113\n", + "9 Hardhat 110539.758225" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(tm.amount) AS total_amount_in_usd\n", + "FROM timeseries_metrics_by_project_v0 AS tm\n", + "JOIN metrics_v0 AS m ON tm.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON tm.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_yearly'\n", + "AND tm.sample_date < DATE '2022-01-01'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe5ad20c-7772-4e94-83ec-77baea215d00", + "metadata": {}, + "source": [ + "## More granular analysis\n", + "\n", + "### Gitcoin\n", + "\n", + "Deep dive on Gitcoin grants to a specific project" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "60aa05c7-098f-4e6f-8183-a01fc5ed9652", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timeround_numberround_nameevent_sourcedonor_addressamount_in_usd
02023-02-09 15:58:07.078<NA>Gitcoin GrantsGITCOIN_DONATIONS0x386ea3171dcc9405311fd75b316cc2a87ecadeca617893.575
12023-08-29 08:32:57.00018Web3 Open Source SoftwareGITCOIN_MATCHING<NA>15001.1375
22024-11-25 14:26:59.00022GG22 OSS - dApps and AppsGITCOIN_MATCHING<NA>14984.28125
32024-05-31 14:35:02.00020dApps & AppsGITCOIN_MATCHING<NA>14979.978125
42023-11-29 20:18:47.00019Web3 Open Source SoftwareGITCOIN_MATCHING<NA>14849.591509
52022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>12500.0
62024-08-26 15:19:00.00021GG21: Thriving Arbitrum SummerGITCOIN_MATCHING<NA>9839.680095
72022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>7410.488854
82024-05-07 10:04:49.00020dApps & AppsGITCOIN_DONATIONS0xe2a26d5174b133abc4b338df1b07295f03a4c85e1000.42865
92024-05-06 17:29:47.00020dApps & AppsGITCOIN_DONATIONS0x60a06b2eee871e349331143ef173ecefd7a8ce01537.338562
\n", + "
" + ], + "text/plain": [ + " time round_number round_name \\\n", + "0 2023-02-09 15:58:07.078 Gitcoin Grants \n", + "1 2023-08-29 08:32:57.000 18 Web3 Open Source Software \n", + "2 2024-11-25 14:26:59.000 22 GG22 OSS - dApps and Apps \n", + "3 2024-05-31 14:35:02.000 20 dApps & Apps \n", + "4 2023-11-29 20:18:47.000 19 Web3 Open Source Software \n", + "5 2022-08-24 00:00:00.000 15 \n", + "6 2024-08-26 15:19:00.000 21 GG21: Thriving Arbitrum Summer \n", + "7 2022-08-24 00:00:00.000 15 \n", + "8 2024-05-07 10:04:49.000 20 dApps & Apps \n", + "9 2024-05-06 17:29:47.000 20 dApps & Apps \n", + "\n", + " event_source donor_address \\\n", + "0 GITCOIN_DONATIONS 0x386ea3171dcc9405311fd75b316cc2a87ecadeca \n", + "1 GITCOIN_MATCHING \n", + "2 GITCOIN_MATCHING \n", + "3 GITCOIN_MATCHING \n", + "4 GITCOIN_MATCHING \n", + "5 GITCOIN_MATCHING \n", + "6 GITCOIN_MATCHING \n", + "7 GITCOIN_MATCHING \n", + "8 GITCOIN_DONATIONS 0xe2a26d5174b133abc4b338df1b07295f03a4c85e \n", + "9 GITCOIN_DONATIONS 0x60a06b2eee871e349331143ef173ecefd7a8ce01 \n", + "\n", + " amount_in_usd \n", + "0 617893.575 \n", + "1 15001.1375 \n", + "2 14984.28125 \n", + "3 14979.978125 \n", + "4 14849.591509 \n", + "5 12500.0 \n", + "6 9839.680095 \n", + "7 7410.488854 \n", + "8 1000.42865 \n", + "9 537.338562 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + " SELECT\n", + " time,\n", + " round_number,\n", + " round_name,\n", + " event_source,\n", + " donor_address,\n", + " amount_in_usd\n", + " FROM int_events__gitcoin_funding\n", + " WHERE gitcoin_group_project_name = 'revokecash'\n", + " ORDER BY amount_in_usd DESC\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "144e833e-cf24-4659-a43b-35505d95501a", + "metadata": {}, + "source": [ + "## OSS Funding\n", + "\n", + "Overview of specific funders and grant pools in oss-funding data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fa50c103-7c3c-4fb2-bcb1-ff754a8b53f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_funder_namegrant_poolsamount_in_usd
0optimism12240450291.744
1arbitrumfoundation1122850952.0
2stellar2932989032.98
3octant-golemfoundation53965429.51329
4dao-drops-dorgtech1250001.0
5clrfund183028.740386
\n", + "
" + ], + "text/plain": [ + " from_funder_name grant_pools amount_in_usd\n", + "0 optimism 12 240450291.744\n", + "1 arbitrumfoundation 1 122850952.0\n", + "2 stellar 29 32989032.98\n", + "3 octant-golemfoundation 5 3965429.51329\n", + "4 dao-drops-dorgtech 1 250001.0\n", + "5 clrfund 1 83028.740386" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " from_funder_name,\n", + " COUNT(DISTINCT grant_pool_name) AS grant_pools,\n", + " SUM(amount) AS amount_in_usd\n", + "FROM stg_ossd__current_funding\n", + "GROUP BY 1\n", + "ORDER BY 3 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7aaf46c-45b8-444e-a6f1-27286085a463", + "metadata": {}, + "source": [ + "## Funding flows\n", + "\n", + "We can use this to construct a simple sankey diagram of funding flows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1bd3a7a8-8cac-420e-9e7b-5259ff0e3b2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_project_idto_project_idfunderprojectamount
2142Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg=Gitcoinefdevcon13.531599
2143Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=GitcoinLexDAO86499.728685
21445Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=clr.fundLexDAO193.952856
2145Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI=GitcoinDeFiEye224058.115245
21465Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q=clr.fundAsilo Digital703.639308
\n", + "
" + ], + "text/plain": [ + " from_project_id \\\n", + "2142 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2143 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2144 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "2145 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2146 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "\n", + " to_project_id funder project \\\n", + "2142 8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg= Gitcoin efdevcon \n", + "2143 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= Gitcoin LexDAO \n", + "2144 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= clr.fund LexDAO \n", + "2145 yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI= Gitcoin DeFiEye \n", + "2146 JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q= clr.fund Asilo Digital \n", + "\n", + " amount \n", + "2142 13.531599 \n", + "2143 86499.728685 \n", + "2144 193.952856 \n", + "2145 224058.115245 \n", + "2146 703.639308 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "SELECT\n", + " fp.project_id AS from_project_id,\n", + " tp.project_id AS to_project_id,\n", + " fp.display_name AS funder,\n", + " tp.display_name AS project,\n", + " SUM(e.amount) AS amount\n", + "FROM int_events_daily__funding AS e\n", + "JOIN artifacts_by_project_v1 AS fa\n", + " ON e.from_artifact_id = fa.artifact_id\n", + "JOIN artifacts_by_project_v1 AS ta\n", + " ON e.to_artifact_id = ta.artifact_id\n", + "JOIN projects_v1 AS fp\n", + " ON fa.project_id = fp.project_id\n", + "JOIN projects_v1 AS tp\n", + " ON ta.project_id = tp.project_id\n", + "GROUP BY 1,2,3,4\n", + "\"\"\"\n", + "df = client.to_pandas(query)\n", + "df.tail()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}