From 5c7ee33e7c22247aa2d09b68588b953c010dea78 Mon Sep 17 00:00:00 2001 From: ccerv1 Date: Thu, 5 Jun 2025 01:05:25 -0400 Subject: [PATCH 1/4] feat(nb): add funding metrics --- tutorials/FundingMetrics.ipynb | 1192 ++++++++++++++++++++++++++++++++ 1 file changed, 1192 insertions(+) create mode 100644 tutorials/FundingMetrics.ipynb diff --git a/tutorials/FundingMetrics.ipynb b/tutorials/FundingMetrics.ipynb new file mode 100644 index 00000000..d64c4876 --- /dev/null +++ b/tutorials/FundingMetrics.ipynb @@ -0,0 +1,1192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "41b714c9-c749-4d0d-ad59-71e0c035d325", + "metadata": {}, + "outputs": [], + "source": [ + "# ! pip install pyoso" + ] + }, + { + "cell_type": "markdown", + "id": "413d143d-4494-4812-8cae-d28f47cc397e", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Load environment variables, import necessary libraries, and initialize the OSO client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "988cd219-8b29-469d-9e7a-46f7a965ddc7", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "import pandas as pd\n", + "from pyoso import Client\n", + "\n", + "load_dotenv()\n", + "\n", + "OSO_API_KEY = os.environ['OSO_API_KEY']\n", + "client = Client(api_key=OSO_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "id": "4231b926-89ab-48de-8746-b0d10f44c470", + "metadata": {}, + "source": [ + "## Testing\n", + "\n", + "Query the metrics table for all metric names containing '_funding_' and display them in alphabetical order" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "77fe6763-7d49-47a5-b5c6-68264034ab0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_name
0GITCOIN_DONATIONS_funding_awarded_biannually
1GITCOIN_DONATIONS_funding_awarded_daily
2GITCOIN_DONATIONS_funding_awarded_monthly
3GITCOIN_DONATIONS_funding_awarded_over_all_time
4GITCOIN_DONATIONS_funding_awarded_quarterly
5GITCOIN_DONATIONS_funding_awarded_weekly
6GITCOIN_DONATIONS_funding_awarded_yearly
7GITCOIN_MATCHING_funding_awarded_biannually
8GITCOIN_MATCHING_funding_awarded_daily
9GITCOIN_MATCHING_funding_awarded_monthly
10GITCOIN_MATCHING_funding_awarded_over_all_time
11GITCOIN_MATCHING_funding_awarded_quarterly
12GITCOIN_MATCHING_funding_awarded_weekly
13GITCOIN_MATCHING_funding_awarded_yearly
14OPEN_COLLECTIVE_funding_received_biannually
15OPEN_COLLECTIVE_funding_received_daily
16OPEN_COLLECTIVE_funding_received_monthly
17OPEN_COLLECTIVE_funding_received_over_all_time
18OPEN_COLLECTIVE_funding_received_quarterly
19OPEN_COLLECTIVE_funding_received_weekly
20OPEN_COLLECTIVE_funding_received_yearly
21OSS_FUNDING_funding_awarded_biannually
22OSS_FUNDING_funding_awarded_daily
23OSS_FUNDING_funding_awarded_monthly
24OSS_FUNDING_funding_awarded_over_all_time
25OSS_FUNDING_funding_awarded_quarterly
26OSS_FUNDING_funding_awarded_weekly
27OSS_FUNDING_funding_awarded_yearly
\n", + "
" + ], + "text/plain": [ + " metric_name\n", + "0 GITCOIN_DONATIONS_funding_awarded_biannually\n", + "1 GITCOIN_DONATIONS_funding_awarded_daily\n", + "2 GITCOIN_DONATIONS_funding_awarded_monthly\n", + "3 GITCOIN_DONATIONS_funding_awarded_over_all_time\n", + "4 GITCOIN_DONATIONS_funding_awarded_quarterly\n", + "5 GITCOIN_DONATIONS_funding_awarded_weekly\n", + "6 GITCOIN_DONATIONS_funding_awarded_yearly\n", + "7 GITCOIN_MATCHING_funding_awarded_biannually\n", + "8 GITCOIN_MATCHING_funding_awarded_daily\n", + "9 GITCOIN_MATCHING_funding_awarded_monthly\n", + "10 GITCOIN_MATCHING_funding_awarded_over_all_time\n", + "11 GITCOIN_MATCHING_funding_awarded_quarterly\n", + "12 GITCOIN_MATCHING_funding_awarded_weekly\n", + "13 GITCOIN_MATCHING_funding_awarded_yearly\n", + "14 OPEN_COLLECTIVE_funding_received_biannually\n", + "15 OPEN_COLLECTIVE_funding_received_daily\n", + "16 OPEN_COLLECTIVE_funding_received_monthly\n", + "17 OPEN_COLLECTIVE_funding_received_over_all_time\n", + "18 OPEN_COLLECTIVE_funding_received_quarterly\n", + "19 OPEN_COLLECTIVE_funding_received_weekly\n", + "20 OPEN_COLLECTIVE_funding_received_yearly\n", + "21 OSS_FUNDING_funding_awarded_biannually\n", + "22 OSS_FUNDING_funding_awarded_daily\n", + "23 OSS_FUNDING_funding_awarded_monthly\n", + "24 OSS_FUNDING_funding_awarded_over_all_time\n", + "25 OSS_FUNDING_funding_awarded_quarterly\n", + "26 OSS_FUNDING_funding_awarded_weekly\n", + "27 OSS_FUNDING_funding_awarded_yearly" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT metric_name\n", + "FROM metrics_v0\n", + "WHERE metric_name LIKE '%_funding_%'\n", + "ORDER BY 1\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "e85f23d4-3605-40f6-8228-ce175de20f30", + "metadata": {}, + "source": [ + "## Aggregate funding metrics\n", + "\n", + "### By source\n", + "\n", + "We currently support CSV data uploads via [oss-funding](https://github.com/opensource-observer/oss-funding) and Gitcoin Grants. We also have Open Collective deposits, but they don't show up here (yet)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aa31b0d1-2117-4fb0-8b6e-f206500602d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_nametotal_amount_in_usd
0OSS_FUNDING_funding_awarded_over_all_time364887873.600968
1GITCOIN_MATCHING_funding_awarded_over_all_time13305117.158144
2GITCOIN_DONATIONS_funding_awarded_over_all_time11666103.711711
\n", + "
" + ], + "text/plain": [ + " metric_name total_amount_in_usd\n", + "0 OSS_FUNDING_funding_awarded_over_all_time 364887873.600968\n", + "1 GITCOIN_MATCHING_funding_awarded_over_all_time 13305117.158144\n", + "2 GITCOIN_DONATIONS_funding_awarded_over_all_time 11666103.711711" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " m.metric_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "WHERE m.metric_name LIKE '%_funding_%'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d2902b8-468b-44ae-bae9-d797ec18de0d", + "metadata": {}, + "source": [ + "### To projects\n", + "\n", + "We can also see the largest project recipients with this query." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9edc7a4-19d1-443c-bc01-ced2c6ffda65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0GMX21000000.0
1MUX Protocol10876479.0
2Synthetix10022628.074157
3Perpetual Protocol9287212.140718
4Gains Network7898396.135
5Velodrome7895037.76024
6Camelot5407500.0
7Stargate Finance5289458.865658
8Vertex Protocol5250000.0
9Radiant4991077.0
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 GMX 21000000.0\n", + "1 MUX Protocol 10876479.0\n", + "2 Synthetix 10022628.074157\n", + "3 Perpetual Protocol 9287212.140718\n", + "4 Gains Network 7898396.135\n", + "5 Velodrome 7895037.76024\n", + "6 Camelot 5407500.0\n", + "7 Stargate Finance 5289458.865658\n", + "8 Vertex Protocol 5250000.0\n", + "9 Radiant 4991077.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name LIKE '%_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0841ca11-505d-47bc-acdc-5ca8314af991", + "metadata": {}, + "source": [ + "### To projects from a specific source" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9783c7e0-c2e0-4f1c-9dda-42d458cb39ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin1099895.038376
1Revoke748859.365745
2DefiLlama429924.507285
3Hey360529.24178
4JediSwap333277.670918
5Dark Forest332205.420888
6ZigZag Exchange210175.931949
7ethers.js190702.539836
8rotki174990.340416
9Taho170854.869607
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 1099895.038376\n", + "1 Revoke 748859.365745\n", + "2 DefiLlama 429924.507285\n", + "3 Hey 360529.24178\n", + "4 JediSwap 333277.670918\n", + "5 Dark Forest 332205.420888\n", + "6 ZigZag Exchange 210175.931949\n", + "7 ethers.js 190702.539836\n", + "8 rotki 174990.340416\n", + "9 Taho 170854.869607" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "807ac8df-5c90-4c14-821b-d29ebab1089c", + "metadata": {}, + "source": [ + "### To projects from a specific source and time frame" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69f8d8ff-f902-4cce-bc3f-d23182f84de3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin797206.330376
1Dark Forest297517.115925
2ZigZag Exchange199746.433382
3ethers.js129500.966707
4Prysm Ethereum Client128522.705766
5rotki122666.927997
6ZeroPool116795.642612
7Lighthouse by Sigma Prime114759.839844
8The Tor Project110669.738113
9Hardhat110539.758225
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 797206.330376\n", + "1 Dark Forest 297517.115925\n", + "2 ZigZag Exchange 199746.433382\n", + "3 ethers.js 129500.966707\n", + "4 Prysm Ethereum Client 128522.705766\n", + "5 rotki 122666.927997\n", + "6 ZeroPool 116795.642612\n", + "7 Lighthouse by Sigma Prime 114759.839844\n", + "8 The Tor Project 110669.738113\n", + "9 Hardhat 110539.758225" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(tm.amount) AS total_amount_in_usd\n", + "FROM timeseries_metrics_by_project_v0 AS tm\n", + "JOIN metrics_v0 AS m ON tm.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON tm.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_yearly'\n", + "AND tm.sample_date < DATE '2022-01-01'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe5ad20c-7772-4e94-83ec-77baea215d00", + "metadata": {}, + "source": [ + "## More granular analysis\n", + "\n", + "### Gitcoin\n", + "\n", + "Deep dive on Gitcoin grants to a specific project" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "60aa05c7-098f-4e6f-8183-a01fc5ed9652", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timeround_numberround_nameevent_sourcedonor_addressamount_in_usd
02023-02-09 15:58:07.078<NA>Gitcoin GrantsGITCOIN_DONATIONS0x386ea3171dcc9405311fd75b316cc2a87ecadeca617893.575
12023-08-29 08:32:57.00018Web3 Open Source SoftwareGITCOIN_MATCHING<NA>15001.1375
22024-11-25 14:26:59.00022GG22 OSS - dApps and AppsGITCOIN_MATCHING<NA>14984.28125
32024-05-31 14:35:02.00020dApps & AppsGITCOIN_MATCHING<NA>14979.978125
42023-11-29 20:18:47.00019Web3 Open Source SoftwareGITCOIN_MATCHING<NA>14849.591509
52022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>12500.0
62024-08-26 15:19:00.00021GG21: Thriving Arbitrum SummerGITCOIN_MATCHING<NA>9839.680095
72022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>7410.488854
82024-05-07 10:04:49.00020dApps & AppsGITCOIN_DONATIONS0xe2a26d5174b133abc4b338df1b07295f03a4c85e1000.42865
92024-05-06 17:29:47.00020dApps & AppsGITCOIN_DONATIONS0x60a06b2eee871e349331143ef173ecefd7a8ce01537.338562
\n", + "
" + ], + "text/plain": [ + " time round_number round_name \\\n", + "0 2023-02-09 15:58:07.078 Gitcoin Grants \n", + "1 2023-08-29 08:32:57.000 18 Web3 Open Source Software \n", + "2 2024-11-25 14:26:59.000 22 GG22 OSS - dApps and Apps \n", + "3 2024-05-31 14:35:02.000 20 dApps & Apps \n", + "4 2023-11-29 20:18:47.000 19 Web3 Open Source Software \n", + "5 2022-08-24 00:00:00.000 15 \n", + "6 2024-08-26 15:19:00.000 21 GG21: Thriving Arbitrum Summer \n", + "7 2022-08-24 00:00:00.000 15 \n", + "8 2024-05-07 10:04:49.000 20 dApps & Apps \n", + "9 2024-05-06 17:29:47.000 20 dApps & Apps \n", + "\n", + " event_source donor_address \\\n", + "0 GITCOIN_DONATIONS 0x386ea3171dcc9405311fd75b316cc2a87ecadeca \n", + "1 GITCOIN_MATCHING \n", + "2 GITCOIN_MATCHING \n", + "3 GITCOIN_MATCHING \n", + "4 GITCOIN_MATCHING \n", + "5 GITCOIN_MATCHING \n", + "6 GITCOIN_MATCHING \n", + "7 GITCOIN_MATCHING \n", + "8 GITCOIN_DONATIONS 0xe2a26d5174b133abc4b338df1b07295f03a4c85e \n", + "9 GITCOIN_DONATIONS 0x60a06b2eee871e349331143ef173ecefd7a8ce01 \n", + "\n", + " amount_in_usd \n", + "0 617893.575 \n", + "1 15001.1375 \n", + "2 14984.28125 \n", + "3 14979.978125 \n", + "4 14849.591509 \n", + "5 12500.0 \n", + "6 9839.680095 \n", + "7 7410.488854 \n", + "8 1000.42865 \n", + "9 537.338562 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + " SELECT\n", + " time,\n", + " round_number,\n", + " round_name,\n", + " event_source,\n", + " donor_address,\n", + " amount_in_usd\n", + " FROM int_events__gitcoin_funding\n", + " WHERE gitcoin_group_project_name = 'revokecash'\n", + " ORDER BY amount_in_usd DESC\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "144e833e-cf24-4659-a43b-35505d95501a", + "metadata": {}, + "source": [ + "## OSS Funding\n", + "\n", + "Overview of specific funders and grant pools in oss-funding data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fa50c103-7c3c-4fb2-bcb1-ff754a8b53f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_funder_namegrant_poolsamount_in_usd
0optimism12240450291.744
1arbitrumfoundation1122850952.0
2stellar2932989032.98
3octant-golemfoundation53965429.51329
4dao-drops-dorgtech1250001.0
5clrfund183028.740386
\n", + "
" + ], + "text/plain": [ + " from_funder_name grant_pools amount_in_usd\n", + "0 optimism 12 240450291.744\n", + "1 arbitrumfoundation 1 122850952.0\n", + "2 stellar 29 32989032.98\n", + "3 octant-golemfoundation 5 3965429.51329\n", + "4 dao-drops-dorgtech 1 250001.0\n", + "5 clrfund 1 83028.740386" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " from_funder_name,\n", + " COUNT(DISTINCT grant_pool_name) AS grant_pools,\n", + " SUM(amount) AS amount_in_usd\n", + "FROM stg_ossd__current_funding\n", + "GROUP BY 1\n", + "ORDER BY 3 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7aaf46c-45b8-444e-a6f1-27286085a463", + "metadata": {}, + "source": [ + "## Funding flows\n", + "\n", + "We can use this to construct a simple sankey diagram of funding flows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1bd3a7a8-8cac-420e-9e7b-5259ff0e3b2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_project_idto_project_idfunderprojectamount
2142Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg=Gitcoinefdevcon13.531599
2143Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=GitcoinLexDAO86499.728685
21445Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=clr.fundLexDAO193.952856
2145Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI=GitcoinDeFiEye224058.115245
21465Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q=clr.fundAsilo Digital703.639308
\n", + "
" + ], + "text/plain": [ + " from_project_id \\\n", + "2142 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2143 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2144 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "2145 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2146 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "\n", + " to_project_id funder project \\\n", + "2142 8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg= Gitcoin efdevcon \n", + "2143 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= Gitcoin LexDAO \n", + "2144 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= clr.fund LexDAO \n", + "2145 yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI= Gitcoin DeFiEye \n", + "2146 JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q= clr.fund Asilo Digital \n", + "\n", + " amount \n", + "2142 13.531599 \n", + "2143 86499.728685 \n", + "2144 193.952856 \n", + "2145 224058.115245 \n", + "2146 703.639308 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "SELECT\n", + " fp.project_id AS from_project_id,\n", + " tp.project_id AS to_project_id,\n", + " fp.display_name AS funder,\n", + " tp.display_name AS project,\n", + " SUM(e.amount) AS amount\n", + "FROM int_events_daily__funding AS e\n", + "JOIN artifacts_by_project_v1 AS fa\n", + " ON e.from_artifact_id = fa.artifact_id\n", + "JOIN artifacts_by_project_v1 AS ta\n", + " ON e.to_artifact_id = ta.artifact_id\n", + "JOIN projects_v1 AS fp\n", + " ON fa.project_id = fp.project_id\n", + "JOIN projects_v1 AS tp\n", + " ON ta.project_id = tp.project_id\n", + "GROUP BY 1,2,3,4\n", + "\"\"\"\n", + "df = client.to_pandas(query)\n", + "df.tail()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a705d367cb6b01f3aa01e639d06f317bba5a557c Mon Sep 17 00:00:00 2001 From: ccerv1 Date: Thu, 5 Jun 2025 09:59:51 -0400 Subject: [PATCH 2/4] feat: ef repo clustering agent --- experiments/ethereum-repo-clusters/.gitignore | 38 ++ experiments/ethereum-repo-clusters/README.md | 331 ++++++++++++++++++ .../ethereum-repo-clusters/__init__.py | 0 .../ethereum-repo-clusters/__main__.py | 9 + .../ethereum-repo-clusters/cli/__init__.py | 7 + .../ethereum-repo-clusters/cli/main_cli.py | 242 +++++++++++++ .../ethereum-repo-clusters/config/__init__.py | 0 .../config/config_manager.py | 139 ++++++++ .../config/prompts/__init__.py | 11 + .../config/prompts/categories.py | 106 ++++++ .../config/prompts/personas.py | 65 ++++ .../config/prompts/summary_prompts.py | 32 ++ .../ethereum-repo-clusters/config/settings.py | 26 ++ .../pipeline/__init__.py | 15 + .../pipeline/categorizer.py | 172 +++++++++ .../pipeline/consolidator.py | 183 ++++++++++ .../pipeline/data_manager.py | 292 +++++++++++++++ .../pipeline/repository_fetcher.py | 125 +++++++ .../pipeline/summary_generator.py | 131 +++++++ .../pipeline/unified_processor.py | 279 +++++++++++++++ .../processing/__init__.py | 10 + .../processing/ai_service.py | 263 ++++++++++++++ .../processing/fetcher.py | 135 +++++++ .../pipeline_config.json | 10 + .../ethereum-repo-clusters/requirements.txt | 8 + experiments/ethereum-repo-clusters/setup.py | 16 + 26 files changed, 
2645 insertions(+) create mode 100644 experiments/ethereum-repo-clusters/.gitignore create mode 100644 experiments/ethereum-repo-clusters/README.md create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py create mode 100644 experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py create mode 100644 experiments/ethereum-repo-clusters/pipeline_config.json create mode 100644 experiments/ethereum-repo-clusters/requirements.txt create mode 100644 experiments/ethereum-repo-clusters/setup.py diff --git a/experiments/ethereum-repo-clusters/.gitignore b/experiments/ethereum-repo-clusters/.gitignore new file mode 100644 index 00000000..58ae5f3c --- /dev/null +++ b/experiments/ethereum-repo-clusters/.gitignore @@ -0,0 +1,38 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environment +.env +.venv +env/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Project specific +data/ +output/ \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/README.md b/experiments/ethereum-repo-clusters/README.md new file mode 100644 index 00000000..3a60aa1b --- /dev/null +++ b/experiments/ethereum-repo-clusters/README.md @@ -0,0 +1,331 @@ +# Ethereum Repo Clusters + +A Python package for automatically clustering Ethereum development tools and libraries based on their README content using AI-driven analysis and multiple personas. + +## Overview + +This project implements a pipeline to: +1. 
Fetch repository data from the OSO (Open Source Observer) database. +2. Retrieve corresponding README files from GitHub. +3. Generate concise project summaries using Google's Gemini AI. +4. Employ multiple configurable AI personas to categorize each project based on its summary and metadata. +5. Consolidate these categorizations, using a star-count weighted approach for projects with multiple repositories, to produce a final recommended category. + +The entire process is managed via a Command Line Interface (CLI). + +## Features + +- Fetches comprehensive repository data via OSO, including fork status and activity tracking. +- Retrieves and processes README.md files from GitHub with robust error handling. +- Utilizes Google's Gemini AI for intelligent summary generation. +- Employs a multi-persona approach for nuanced project categorization. +- Supports an arbitrary number of configurable AI personas. +- Calculates final project recommendations using star-count weighted consolidation. +- Offers both modular pipeline and unified processing approaches. +- Provides detailed tracking of repository status (active/inactive, fork/non-fork). +- Handles empty or error READMEs gracefully with "UNCATEGORIZED" status. +- Includes timestamps for all categorization operations. +- Test mode for quick runs on a subset of data. +- Outputs data at various stages in Parquet and CSV formats (with README text removed from CSV for readability). +- Supports easy resumption of processing and addition of new repositories. +- Features comprehensive progress bars at multiple levels for better visibility into processing status. + +## Prerequisites + +- Python 3.10+ +- Access to OSO, GitHub, and Google Gemini APIs. + +## Installation + +1. **Clone the repository:** + ```bash + git clone + cd ethereum-repo-clusters + ``` + +2. **Set up a virtual environment (recommended):** + ```bash + python -m venv venv + source venv/bin/activate # On Windows use `venv\Scripts\activate` + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + +4. **Install the package in editable mode (optional, for development):** + ```bash + pip install -e . + ``` + +5. **Create a `.env` file** in the project root directory (`ethereum-repo-clusters/`) and add your API keys: + ```env + OSO_API_KEY="your_oso_api_key" + GITHUB_TOKEN="your_github_token" # A GitHub Personal Access Token with repo access + GEMINI_API_KEY="your_gemini_api_key" + ``` + These keys are loaded via `ethereum-repo-clusters/config/settings.py`. + +## Configuration + +The project uses a combination of a JSON configuration file and Python modules for settings: + +- **`pipeline_config.json`**: + - Located at the project root. + - Controls operational settings like `output_dir`, `test_mode`, `test_mode_limit`, AI model name (`gemini_model`), and batch sizes for AI processing. + - If this file is missing, it will be automatically created with default values on the first run. + - Values in this file override defaults sourced from Python modules. + +- **AI Personas (`ethereum-repo-clusters/config/prompts/personas.py`):** + - Define the different AI personas used for categorization. + - Each persona is a dictionary with `name`, `title`, `description`, and a `prompt` template. + - Modify this Python list directly to add, remove, or change personas. + +- **Categories (`ethereum-repo-clusters/config/prompts/categories.py`):** + - Defines the list of possible categories projects can be assigned to. 
+ - Includes `CATEGORIES` (list of dicts with `category` and `description`) and `CATEGORY_NAMES` (a simple list of category names). + - Edit this file to update the categorization taxonomy. + +- **Prompt Templates (`ethereum-repo-clusters/config/prompts/summary_prompts.py`):** + - Contains `SUMMARY_PROMPT` (for generating project summaries) and `TAGS_PROMPT` (for an auxiliary tag generation, currently not central to categorization). + - These are used by the `AIService`. + +- **Core Settings (`ethereum-repo-clusters/config/settings.py`):** + - Loads API keys from the `.env` file. + - Defines default values for `GEMINI_MODEL` and `OUTPUT_DIR` if not specified in `pipeline_config.json`. + +## Usage (CLI) + +The project is operated via the command line using `python -m ethereum-repo-clusters`. + +**General Command Structure:** +```bash +python -m ethereum-repo-clusters [GLOBAL_OPTIONS] COMMAND [COMMAND_OPTIONS] +``` + +**Global Options:** +- `--test-mode`: Runs the specified command(s) in test mode, processing a limited number of repositories (defined by `test_mode_limit` in `pipeline_config.json`, sorted by stars). + +**Main Commands:** + +- **`fetch_repos`**: Fetches repository data from OSO and READMEs from GitHub. + ```bash + python -m ethereum-repo-clusters fetch_repos + ``` + - `--force-refresh`: Wipes existing raw repository data and re-fetches. + - `--fetch-new-only`: Only fetches repositories that don't exist in current data. + +- **`generate_summaries`**: Generates AI summaries for fetched repositories. + ```bash + python -m ethereum-repo-clusters generate_summaries + ``` + - `--force-refresh`: Wipes existing summaries and regenerates them. + - `--new-only`: Only generates summaries for repositories that don't have summaries yet. + +- **`categorize`**: Categorizes projects using all defined AI personas. + ```bash + python -m ethereum-repo-clusters categorize + ``` + - `--force-refresh`: Wipes existing categorizations and re-runs. + - `--persona `: Processes only the specified persona. Can be combined with `--force-refresh`. Example: + ```bash + python -m ethereum-repo-clusters categorize --persona keyword_spotter --force-refresh + ``` + - `--new-only`: Only categorizes repositories that don't have categories yet. + +- **`consolidate`**: Consolidates categorizations from all personas and generates final project recommendations. + ```bash + python -m ethereum-repo-clusters consolidate + ``` + *(This step does not typically require a force-refresh as it always processes the latest categorized data.)* + +**Persona Management (Informational):** +The CLI includes commands related to personas, but due to refactoring, persona definitions are now managed directly in `ethereum-repo-clusters/config/prompts/personas.py`. These CLI commands are informational: + +- `python -m ethereum-repo-clusters personas list`: Lists personas currently defined in `personas.py`. +- `python -m ethereum-repo-clusters personas add ...`: Provides instructions on how to add a persona by editing `personas.py`. +- `python -m ethereum-repo-clusters personas remove `: Provides instructions on how to remove a persona by editing `personas.py`. + +**Example Full Run in Test Mode with Full Refresh:** +```bash +# Legacy pipeline approach +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all + +# New unified processor approach (recommended) +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all --use-unified +``` + +## Workflow + +### Legacy Pipeline (Step-by-Step) + +1. 
**Fetch Data (`fetch_repos`):** + - Repository metadata is fetched from OSO. + - README.md content is fetched from GitHub for these repositories. + - Output: `output/devtooling_raw.parquet` + +2. **Generate Summaries (`generate_summaries`):** + - READMEs are processed by Gemini AI to create concise summaries. + - Output: `output/devtooling_summarized.parquet` + +3. **Categorize by Persona (`categorize`):** + - Each project summary (with metadata) is evaluated by every defined AI persona. + - Each persona assigns a category based on its specific prompt and the global category list. + - Output: Individual Parquet files per persona in `output/categorized/` (e.g., `output/categorized/keyword_spotter.parquet`). + +4. **Consolidate Recommendations (`consolidate`):** + - Categorizations from all personas are merged. + - For each project: + - If it's a single-repository project, the recommendation is based on a star-weighted aggregation of persona assignments for that repo. + - If it's a multi-repository project, the recommendation is determined by a star-count weighted aggregation of all persona assignments across all its repositories. The category with the highest total star weight wins. + - Output: `output/devtooling_full.parquet` and `output/devtooling_consolidated.csv`. + +### New Unified Processor (Recommended) + +The new unified processor combines all steps into a single efficient pipeline: + +1. **Process Repositories (`process_unified`):** + - Repository metadata is fetched from OSO, including fork status and activity tracking. + - README.md content is fetched from GitHub with robust error handling. + - For each repository with a valid README: + - A summary is generated immediately. + - All personas categorize the repository in sequence. + - Results are stored with timestamps for each operation. + - For repositories with empty or error READMEs: + - Status is tracked as "EMPTY" or "ERROR". + - All categorizations are marked as "UNCATEGORIZED". + - A final recommendation is determined based on the most common category across personas. + - Output: `output/ethereum_repos_unified.parquet` and `output/ethereum_repos_unified.csv`. + +The unified processor offers several advantages: +- Single pass through repositories (more efficient) +- Better error handling and status tracking +- Easier to resume processing or add new repositories +- Comprehensive data structure with all information in one place +- Timestamps for all operations for better traceability +- Detailed progress bars for tracking processing status at multiple levels +- CSV output with README text removed for improved readability + +## Output Files + +All output data is stored in the directory specified by `output_dir` in `pipeline_config.json` (default is `output/`). + +### Legacy Pipeline Output + +- **`devtooling_raw.parquet`**: Raw data fetched from OSO, augmented with GitHub README content. +- **`devtooling_summarized.parquet`**: Repositories with their AI-generated summaries. +- **`categorized/.parquet`**: Dataframe for each persona, containing the original summary data plus that persona's assigned category and reason. +- **`devtooling_full.parquet`**: The final consolidated dataset, with one row per project, including the overall recommendation, total stars, repo count, sample summary, and individual persona category modes. +- **`devtooling_consolidated.csv`**: A CSV version of the final consolidated data for easier viewing. 
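+
+For a quick look at these legacy outputs, here is a minimal sketch (it assumes `pandas` and a Parquet engine such as `pyarrow` are installed; no specific column names are assumed):
+
+```python
+import pandas as pd
+
+# Load the final consolidated dataset written by the `consolidate` step.
+full = pd.read_parquet("output/devtooling_full.parquet")
+
+# Inspect the schema and a few rows before doing further analysis.
+print(full.columns.tolist())
+print(full.head())
+```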
+ +### Unified Processor Output + +- **`ethereum_repos_unified.parquet`**: Comprehensive dataset containing all repositories with their metadata, summaries, and categorizations in a single structure. +- **`ethereum_repos_unified.csv`**: A CSV version of the unified data for easier viewing, with README text removed and long text fields truncated for readability. + +### Unified Data Structure + +The unified processor creates a comprehensive data structure with the following key fields: + +```json +{ + "repo_artifact_id": "...", + "project_id": "...", + "repo_artifact_namespace": "...", + "repo_artifact_name": "...", + "is_fork": true/false, + "is_actively_maintained": true/false, + "last_updated": "2024-12-01", + "star_count": 100, + "readme_status": "SUCCESS/EMPTY/ERROR", + "summary": "...", + "categorizations": [ + { + "persona_name": "keyword_spotter", + "category": "Developer Tools", + "reason": "Contains keywords like 'CLI', 'build tool'...", + "timestamp": "2025-01-05T09:15:00Z" + }, + { + "persona_name": "senior_strategist", + "category": "Infrastructure", + "reason": "Mature project with strong adoption...", + "timestamp": "2025-01-05T09:15:01Z" + }, + { + "persona_name": "workflow_wizard", + "category": "Developer Tools", + "reason": "Streamlines development workflow...", + "timestamp": "2025-01-05T09:15:02Z" + } + ], + "final_recommendation": "Developer Tools", + "processing_timestamp": "2025-01-05T09:15:02Z" +} +``` + +This structure makes it easy to: +- Track which repositories have been processed +- Identify repositories with errors or empty READMEs +- See the categorization from each persona with timestamps +- Filter repositories by fork status or activity +- Resume processing from where you left off + +## Development Notes +- The project uses `tqdm` for progress bars during long operations, with detailed progress tracking at multiple levels: + - Overall batch processing + - Repository processing within each batch + - README fetching for each repository + - Categorization with each persona +- `DataManager` class in `ethereum-repo-clusters/pipeline/data_manager.py` handles all data persistence (reading/writing Parquet files). +- `AIService` in `ethereum-repo-clusters/processing/ai_service.py` abstracts interactions with the Gemini API. +- `UnifiedProcessor` in `ethereum-repo-clusters/pipeline/unified_processor.py` provides the new streamlined processing approach. +- The CLI in `ethereum-repo-clusters/cli/main_cli.py` supports both legacy and unified processing approaches. +- Output files are saved to the local `output/` directory in the current repository. + +## New CLI Commands + +### Unified Processing + +```bash +# Process repositories with the unified processor +python -m ethereum-repo-clusters process_unified [OPTIONS] + +# Options: +# --force-refresh Force refresh all data, ignoring existing. +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +# --limit INTEGER Limit the number of repositories to process. +``` + +### Run All with Unified Processor + +```bash +# Run the entire pipeline using the unified processor +python -m ethereum-repo-clusters run_all --use-unified [OPTIONS] + +# Additional options with --use-unified: +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +``` + +## Adding New Repositories + +To add new repositories to the analysis: + +1. 
The unified processor automatically detects which repositories have already been processed. +2. New repositories from OSO will be processed automatically on the next run. +3. To add repositories manually, you can: + - Update the OSO query in `fetcher.py` to include additional repositories. + - Create a custom script that adds repositories to the unified data structure. + +## Error Handling + +The unified processor handles errors gracefully: + +- Empty READMEs: Marked with `readme_status="EMPTY"` and categorized as "UNCATEGORIZED". +- Error fetching README: Marked with `readme_status="ERROR"` and categorized as "UNCATEGORIZED". +- API errors during categorization: The specific persona's categorization is marked as "UNCATEGORIZED" with the error reason. + +This approach ensures that all repositories are included in the final output, even if they couldn't be fully processed. diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py new file mode 100644 index 00000000..4bedcbac --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py @@ -0,0 +1,9 @@ +from .cli.main_cli import cli + +def main(): + # The obj={} is a way to initialize Click's context object + # if it's not being run directly by the `click` runner (e.g. `python -m devtooling_labels`) + cli(obj={}) + +if __name__ == "__main__": + main() diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py new file mode 100644 index 00000000..986cde36 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py @@ -0,0 +1,7 @@ +# This file makes the 'cli' directory a Python package. + +from .main_cli import cli + +__all__ = [ + "cli" +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py new file mode 100644 index 00000000..9661a7cb --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py @@ -0,0 +1,242 @@ +import click +from pathlib import Path + +from ..config.config_manager import ConfigManager +from ..pipeline.data_manager import DataManager +from ..processing.ai_service import AIService +from ..pipeline.repository_fetcher import RepositoryFetcherStep +from ..pipeline.summary_generator import SummaryGeneratorStep +from ..pipeline.categorizer import CategorizerStep +from ..pipeline.consolidator import ConsolidatorStep +from ..pipeline.unified_processor import UnifiedProcessor + +# Initialize ConfigManager globally or pass as context +# For simplicity here, we'll initialize it where needed or once at the top. +# A more robust Click app might use a context object. 
+config_manager = ConfigManager() # Loads default or existing pipeline_config.json + +@click.group() +@click.option('--test-mode', is_flag=True, help='Run in test mode (limits fetched repos, uses test_mode_limit from config).') +@click.pass_context +def cli(ctx, test_mode): + """DevTooling Labels CLI for processing and categorizing repositories.""" + ctx.ensure_object(dict) + + # Update config if test_mode flag is set via CLI + # This overrides the value in pipeline_config.json for this run + if test_mode: + config_manager.set("test_mode", True) + # No need to save if it's a per-run override. + # If we want to persist it: config_manager.save_config() + print(f"CLI flag --test-mode is set. Running in test mode. Limit: {config_manager.get_test_mode_limit()} repos.") + else: + # If not set by CLI, respect the config file's test_mode setting + # Or, explicitly set to False if CLI should always override to False when flag not present + # config_manager.set("test_mode", False) # Uncomment if CLI flag absence means test_mode is OFF + pass # Current behavior: respects config file if CLI flag is absent. + + # Initialize services and pass them via context if needed by multiple commands + # Or initialize them within each command + output_dir = config_manager.get_output_dir() + data_manager = DataManager(output_dir=output_dir, config=config_manager) + ai_service = AIService(config_manager=config_manager) + + ctx.obj['config_manager'] = config_manager + ctx.obj['data_manager'] = data_manager + ctx.obj['ai_service'] = ai_service + ctx.obj['output_dir'] = output_dir + + +@cli.command("fetch_repos") +@click.option('--force-refresh', is_flag=True, help='Force refresh repository data, ignoring existing.') +@click.option('--fetch-new-only', is_flag=True, help='Only fetch repositories that don\'t exist in current data.') +@click.pass_context +def fetch_repos_command(ctx, force_refresh, fetch_new_only): + """Fetches repositories and their READMEs.""" + print("Executing: Fetch Repositories") + data_manager = ctx.obj['data_manager'] + # ConfigManager is already aware of test_mode from the group command + config_mgr = ctx.obj['config_manager'] + + repo_fetcher_step = RepositoryFetcherStep(data_manager=data_manager, config_manager=config_mgr) + repo_fetcher_step.run(force_refresh=force_refresh, fetch_new_only=fetch_new_only) + print("Repository fetching complete.") + + +@cli.command("generate_summaries") +@click.option('--force-refresh', is_flag=True, help='Force refresh summaries, ignoring existing.') +@click.option('--new-only', is_flag=True, help='Generate summaries only for repositories that don\'t have summaries yet.') +@click.pass_context +def generate_summaries_command(ctx, force_refresh, new_only): + """Generates summaries for the fetched repositories.""" + print("Executing: Generate Summaries") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + summary_generator_step = SummaryGeneratorStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + summary_generator_step.run(force_refresh=force_refresh, new_only=new_only) + print("Summary generation complete.") + + +@cli.command("categorize") +@click.option('--force-refresh', is_flag=True, help='Force refresh categories, ignoring existing.') +@click.option('--persona', help='Process only the specified persona.') +@click.option('--new-only', is_flag=True, help='Categorize only repositories that don\'t have categories yet.') +@click.pass_context +def 
categorize_command(ctx, force_refresh, persona, new_only): + """Categorizes projects using AI personas.""" + print("Executing: Categorize") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + categorizer_step = CategorizerStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + categorizer_step.run(force_refresh=force_refresh, target_persona_name=persona, new_only=new_only) + print("Categorization complete.") + + +@cli.command("consolidate") +@click.pass_context +def consolidate_command(ctx): + """Consolidates categorizations and generates final recommendations.""" + print("Executing: Consolidate Analysis") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + + consolidator_step = ConsolidatorStep(data_manager=data_manager, config_manager=config_mgr) + consolidator_step.run() + print("Consolidation complete.") + + +@cli.command("process_unified") +@click.option('--force-refresh', is_flag=True, help='Force refresh all data, ignoring existing.') +@click.option('--include-forks', is_flag=True, help='Include forked repositories in processing.') +@click.option('--include-inactive', is_flag=True, help='Include repositories not updated in the last year.') +@click.option('--limit', type=int, help='Limit the number of repositories to process.') +@click.pass_context +def process_unified_command(ctx, force_refresh, include_forks, include_inactive, limit): + """ + Unified processing: fetches repos, READMEs, generates summaries, and categorizes in one pass. + Outputs a single comprehensive dataset with all information. + """ + print("Executing: Unified Processing Pipeline") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + processor = UnifiedProcessor( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + + processor.run( + force_refresh=force_refresh, + include_forks=include_forks, + inactive_repos=include_inactive, + limit=limit + ) + + print("Unified processing complete.") + print(f"Results saved to:") + print(f" - {data_manager.unified_parquet_path} (Parquet format)") + print(f" - {data_manager.unified_csv_path} (CSV format)") + + +@cli.command("run_all") +@click.option('--force-refresh-all', is_flag=True, help='Force refresh all data stages.') +@click.option('--force-refresh-repos', is_flag=True, help='Force refresh repository data.') +@click.option('--force-refresh-summaries', is_flag=True, help='Force refresh summaries.') +@click.option('--force-refresh-categories', is_flag=True, help='Force refresh categories.') +@click.option('--use-unified', is_flag=True, help='Use the new unified processor instead of the legacy pipeline.') +@click.option('--include-forks', is_flag=True, help='Include forked repositories (only with --use-unified).') +@click.option('--include-inactive', is_flag=True, help='Include inactive repositories (only with --use-unified).') +@click.pass_context +def run_all_command(ctx, force_refresh_all, force_refresh_repos, force_refresh_summaries, + force_refresh_categories, use_unified, include_forks, include_inactive): + """Runs the entire pipeline: either legacy steps or the new unified processor.""" + + if use_unified: + print("Executing: Run All Using Unified Processor") + ctx.invoke( + process_unified_command, + force_refresh=force_refresh_all, + include_forks=include_forks, + include_inactive=include_inactive, + limit=None + ) + else: 
+ print("Executing: Run All Pipeline Steps (Legacy)") + # Determine force_refresh flags for each step + fr_repos = force_refresh_all or force_refresh_repos + fr_summaries = force_refresh_all or force_refresh_summaries + fr_categories = force_refresh_all or force_refresh_categories + + # Invoke other commands with determined force_refresh settings + # The --test-mode flag from the main group is implicitly handled by ConfigManager + ctx.invoke(fetch_repos_command, force_refresh=fr_repos) + ctx.invoke(generate_summaries_command, force_refresh=fr_summaries) + ctx.invoke(categorize_command, force_refresh=fr_categories, persona=None, new_only=False) # Process all personas + ctx.invoke(consolidate_command) + + print("Full pipeline execution complete.") + +# Commands for managing personas in config +@cli.group("personas") +def personas_group(): + """Manage AI personas in the configuration.""" + pass + +@personas_group.command("list") +@click.pass_context +def list_personas(ctx): + """Lists all configured personas.""" + config_mgr = ctx.obj['config_manager'] + personas = config_mgr.get_personas() + if not personas: + print("No personas configured.") + return + print("Configured Personas:") + for p in personas: + print(f"- Name: {p['name']}, Title: {p.get('title', 'N/A')}") + +@personas_group.command("add") +@click.option('--name', required=True, help="Unique name for the persona.") +@click.option('--title', required=True, help="Display title for the persona.") +@click.option('--description', required=True, help="Description of the persona's focus.") +@click.option('--prompt-template', required=True, help="Prompt template for the persona's classification task.") +@click.pass_context +def add_persona(ctx, name, title, description, prompt_template): + """Adds a new persona to the configuration.""" + config_mgr = ctx.obj['config_manager'] + new_persona = { + "name": name, + "title": title, + "description": description, + "prompt": prompt_template + } + # config_mgr.add_persona(new_persona) # This method was removed as personas are managed in personas.py + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. '{name}' was not added via CLI.") + print("To add a persona, please edit the personas.py file directly.") + +@personas_group.command("remove") +@click.argument('name') +@click.pass_context +def remove_persona(ctx, name): + """Removes a persona by name. (Note: Persona management is now via personas.py)""" + # config_mgr = ctx.obj['config_manager'] + # config_mgr.remove_persona(name) # This method was removed from ConfigManager + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. 
'{name}' was not removed via CLI.") + print("To remove a persona, please edit the personas.py file directly.") + +if __name__ == '__main__': + cli(obj={}) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py new file mode 100644 index 00000000..a9fb5bb1 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py @@ -0,0 +1,139 @@ +import json +from pathlib import Path +from typing import List, Dict, Any +from .settings import PROJECT_ROOT, GEMINI_API_KEY, OSO_API_KEY, GITHUB_TOKEN, GEMINI_MODEL, OUTPUT_DIR +from .prompts.summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + + + +class ConfigManager: + def __init__(self, config_file_name: str = "pipeline_config.json"): + self.config_file_path = PROJECT_ROOT / config_file_name + self.config_data = self._load_config() + + def _load_config(self) -> Dict[str, Any]: + """ + Loads configuration from a JSON file, merging it with defaults. + If the file doesn't exist or is invalid, creates a default one. + Values in the JSON file override default values. + """ + default_config = self._get_default_config() + + if self.config_file_path.exists(): + with open(self.config_file_path, 'r') as f: + try: + loaded_config = json.load(f) + # Merge: loaded_config values override default_config values + merged_config = {**default_config, **loaded_config} + return merged_config + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON from {self.config_file_path}. Using full default config.") + # If JSON is corrupt, return the full default config, don't save it over potentially good file yet. + # Or, we could save default_config here if we want to overwrite corrupted file. + # For now, just return defaults for this session. + return default_config + else: + print(f"Config file not found at {self.config_file_path}. Creating and using default config.") + # Save the full default config as the new file + self.save_config(default_config) + return default_config + + def _get_default_config(self) -> Dict[str, Any]: + """Returns the default configuration dictionary.""" + return { + "output_dir": str(OUTPUT_DIR), + "gemini_model": GEMINI_MODEL, + "summary_prompt_template": SUMMARY_PROMPT, + "tags_prompt_template": TAGS_PROMPT, + "test_mode": False, + "test_mode_limit": 5, + "batch_size_summaries": 50, + "batch_size_categorization": 10 # Smaller batch for categorization due to prompt complexity + } + + def save_config(self, config_data: Dict[str, Any] = None): + """Saves the current configuration to the JSON file.""" + data_to_save = config_data if config_data else self.config_data + with open(self.config_file_path, 'w') as f: + json.dump(data_to_save, f, indent=4) + print(f"Configuration saved to {self.config_file_path}") + + def get(self, key: str, default: Any = None) -> Any: + """Gets a configuration value by key.""" + return self.config_data.get(key, default) + + def set(self, key: str, value: Any): + """Sets a configuration value and saves the config.""" + if key in ["gemini_api_key", "oso_api_key", "github_token"]: + print(f"Warning: Attempted to set API key '{key}' in config file. 
API keys should be managed via .env file.") + return + self.config_data[key] = value + self.save_config() + + # --- API Key Getters --- + def get_gemini_api_key(self) -> str: + """Gets the Gemini API key directly from settings (environment).""" + return GEMINI_API_KEY + + def get_oso_api_key(self) -> str: + """Gets the OSO API key directly from settings (environment).""" + return OSO_API_KEY + + def get_github_token(self) -> str: + """Gets the GitHub token directly from settings (environment).""" + return GITHUB_TOKEN + + # --- Other Getters --- + def get_personas(self) -> List[Dict[str, str]]: + """Gets the list of personas directly from the personas.py module.""" + from .prompts.personas import PERSONAS + return PERSONAS + + # add_persona and remove_persona are removed as personas are managed in personas.py + + def is_test_mode(self) -> bool: + """Checks if test mode is enabled.""" + return self.get("test_mode", False) + + def get_test_mode_limit(self) -> int: + """Gets the limit for test mode.""" + return self.get("test_mode_limit", 5) + + def get_output_dir(self) -> Path: + return Path(self.get("output_dir", str(OUTPUT_DIR))) + + def get_batch_size_summaries(self) -> int: + return self.get("batch_size_summaries", 50) + + def get_batch_size_categorization(self) -> int: + return self.get("batch_size_categorization", 10) + + def get_categories(self) -> List[Dict[str, str]]: + """Gets the categories directly from the categories.py module.""" + from .prompts.categories import CATEGORIES + return CATEGORIES + + def get_category_names(self) -> List[str]: + """Gets the category names directly from the categories.py module.""" + from .prompts.categories import CATEGORY_NAMES + return CATEGORY_NAMES + + def get_summary_prompt_template(self) -> str: + return self.get("summary_prompt_template", "") + + def get_tags_prompt_template(self) -> str: + return self.get("tags_prompt_template", "") + +if __name__ == "__main__": + # Example usage: + config_manager = ConfigManager() + print(f"Output Directory: {config_manager.get_output_dir()}") + print(f"Test Mode: {config_manager.is_test_mode()}") + # Example active print for personas: + print("\nPersonas (from personas.py):") + for p in config_manager.get_personas(): + print(f"- {p['name']}: {p['title']}") + + print("\nCategories (from categories.py):") + for cat_name in config_manager.get_category_names(): + print(f"- {cat_name}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py new file mode 100644 index 00000000..c22d1787 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py @@ -0,0 +1,11 @@ +from .categories import CATEGORIES, CATEGORY_NAMES +from .personas import PERSONAS +from .summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + +__all__ = [ + 'CATEGORIES', + 'CATEGORY_NAMES', + 'PERSONAS', + 'SUMMARY_PROMPT', + 'TAGS_PROMPT', +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py new file mode 100644 index 00000000..61bae994 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py @@ -0,0 +1,106 @@ +CATEGORIES = [ + { + "category": "Language & Compilation Tools", + "description": ( + "Language & Compilation Tools include compilers, interpreters, language servers, " + "and syntax 
utilities for smart-contract development. They translate high-level " + "source code into EVM bytecode, perform static analysis, and enable features like " + "symbolic execution, forming the foundation for all higher-level tooling." + ) + }, + { + "category": "Core Protocol Interfaces", + "description": ( + "Core Protocol Interfaces are libraries and SDKs that provide reusable building blocks " + "for blockchain developers—smart contract libraries, JSON-RPC clients, transaction builders, " + "wallet and key management, authorization, signature handling, and ABI encoding/decoding. " + "They can power the core operations of many dApps and services." + ) + }, + { + "category": "Development Frameworks", + "description": ( + "Development Frameworks are opinionated, end-to-end toolchains that scaffold, build, " + "test, and deploy smart-contract projects. They bundle CLIs, IDE integrations, task " + "runners, local networks, hot-reloading, and plugin ecosystems to enforce conventions " + "and automate workflows from project setup through to frontend integration." + ) + }, + { + "category": "Deployment & Lifecycle Management", + "description": ( + "Deployment & Lifecycle Management tools handle contract deployment, upgrades, and " + "on-chain migrations. They automate predictable CREATE2 strategies, proxy pattern " + "management, cross-network publishes, and governance hooks, while integrating safety " + "checks and test-suite validations to maintain contract integrity." + ) + }, + { + "category": "Testing & Verification Tools", + "description": ( + "Testing & Verification Tools provide frameworks for unit testing, property-based fuzzing, " + "symbolic execution, formal verification, and coverage analysis. They integrate vulnerability " + "scanners, static analyzers, and coverage reporters to identify edge-case failures and ensure " + "on-chain correctness." + ) + }, + { + "category": "Developer Experience Tools", + "description": ( + "Developer Experience Tools are lightweight plugins and utilities that boost productivity " + "and enforce code consistency. This category includes editor extensions, linters, formatters, " + "code generators, documentation generators, and small CLI helpers." + ) + }, + { + "category": "Infrastructure & Node Operations", + "description": ( + "Infrastructure & Node Operations encompass tools for running, coordinating, and scaling " + "blockchain nodes and peer-to-peer networks. They cover RPC providers, telemetry collectors, " + "log aggregators, gossip-based messaging layers, peer discovery and connection management, " + "and automation scripts to ensure reliable network participation." + ) + }, + { + "category": "Data Indexing & Analytics", + "description": ( + "Data Indexing & Analytics tools ingest, process, and visualize on-chain data. They provide " + "GraphQL and REST APIs over processed datasets, real-time event streaming, and libraries or " + "dashboards for analyzing blockchain metrics." + ) + }, + { + "category": "Interoperability & Cross-chain", + "description": ( + "Interoperability & Cross-chain covers bridging frameworks, cross-chain messaging protocols, " + "and Superchain interoperability tooling. These libraries enable seamless asset transfers, " + "state proofs, and communication across multiple networks." 
+ ) + }, + { + "category": "Cryptography & Primitives", + "description": ( + "Cryptography & Primitives includes low-level cryptographic libraries and building blocks—" + "hash functions, signature schemes, Merkle trees, zero-knowledge proof primitives, and " + "encryption utilities—optimized for security and performance." + ) + }, + { + "category": "Application-Specific & Niche Tools", + "description": ( + "Application-Specific & Niche Tools are libraries and SDKs tailored to very narrow use cases " + "(e.g., DeFi adapters, NFT marketplaces, governance dashboards). They serve specific projects " + "but do not have broad applicability or reusability across the ecosystem." + ) + }, + { + "category": "Others", + "description": ( + "Others is a catch-all for repositories with limited usage or insufficient information—" + "empty projects, single-file utilities, or items that cannot be reasonably categorized." + ) + } +] + +# Create a list of category names for easy access +CATEGORY_NAMES = [cat["category"] for cat in CATEGORIES] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py new file mode 100644 index 00000000..1ad6ab82 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py @@ -0,0 +1,65 @@ +PERSONAS = [ + { + "name": "keyword_spotter", + "title": "Keyword Spotter", + "description": ( + "You focus on explicit keywords in summaries and metadata to quickly map " + "projects to the most likely category." + ), + "prompt": ( + "As a Keyword Spotter, scan the project summary and metadata for tell-tale terms.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Based on these details, choose one of the categories below:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "which keywords influenced your decision"\n' + "}}" + ), + }, + { + "name": "senior_strategist", + "title": "Senior Strategist", + "description": ( + "You take a broad, long-term view—considering maturity, community traction, " + "and ecosystem fit—to carefully assign the most appropriate category." + ), + "prompt": ( + "As a Senior Strategist, evaluate the project’s maturity, adoption, and fit.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select one of the categories below:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "holistic rationale covering maturity, adoption, and ecosystem utility"\n' + "}}" + ), + }, + { + "name": "workflow_wizard", + "title": "Workflow Wizard", + "description": ( + "You imagine the ideal developer journey—setup, day-to-day ergonomics, " + "and integration—and assign the category that feels most intuitive." 
+ ), + "prompt": ( + "As a Workflow Wizard, envision how a developer would onboard and use this tool.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best supports a seamless workflow:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis based on developer ergonomics and workflow"\n' + "}}" + ), + } +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py new file mode 100644 index 00000000..cdd27bfb --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py @@ -0,0 +1,32 @@ +SUMMARY_PROMPT = ( + "You are an analyst preparing short, neutral briefs on open-source projects. " + "Read the README below and write a **concise, 2- to 3-sentence summary** that:\n" + "• states the project’s core purpose / problem it solves\n" + "• lists its main capabilities or components (1–3 key points only)\n" + "• mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n" + "• notes any strongly signalled context such as supported programming language, network, or runtime\n" + "\n" + "**Style constraints**\n" + "• Use plain, factual language in third person (no hype, no marketing adjectives).\n" + "• **Do not** guess or invent details that are not explicit in the README.\n" + "• **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n" + "• Limit the summary to <100 words; avoid bullet lists or line breaks.\n" + "\n" + "Return your answer as **exactly one valid JSON object** in this form (nothing extra):\n" + "{{\n" + ' \"summary\": \"your summary here\"\n' + "}}\n" + "\n" + "README:\n" + "{readme_md}" +) + +TAGS_PROMPT = ( + "Based on this project summary, generate a list of relevant tags that " + "describe the project's purpose and functionality.\n\n" + "You must respond with a valid JSON object in this exact format:\n" + "{{\n" + ' "tags": ["tag1", "tag2", "tag3"]\n' + "}}\n\n" + "Summary:\n{summary}" +) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py new file mode 100644 index 00000000..79c7ebb0 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py @@ -0,0 +1,26 @@ +import os +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# API Keys +OSO_API_KEY = os.getenv("OSO_API_KEY") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") + +# Project paths +PROJECT_ROOT = Path(__file__).parent.parent.parent +DATA_DIR = PROJECT_ROOT / "data" +OUTPUT_DIR = PROJECT_ROOT / "output" + +# Create directories if they don't exist +DATA_DIR.mkdir(exist_ok=True) +OUTPUT_DIR.mkdir(exist_ok=True) + +# GitHub API settings +GITHUB_HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} + +# Gemini model settings +GEMINI_MODEL = "gemini-2.0-flash" \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py new file mode 100644 index 00000000..51e9f286 --- /dev/null +++ 
b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py @@ -0,0 +1,15 @@ +# This file makes the 'pipeline' directory a Python package. + +from .data_manager import DataManager +from .repository_fetcher import RepositoryFetcherStep +from .summary_generator import SummaryGeneratorStep +from .categorizer import CategorizerStep +from .consolidator import ConsolidatorStep + +__all__ = [ + "DataManager", + "RepositoryFetcherStep", + "SummaryGeneratorStep", + "CategorizerStep", + "ConsolidatorStep", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py new file mode 100644 index 00000000..67762e9a --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py @@ -0,0 +1,172 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, ClassificationOutput + +class CategorizerStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, target_persona_name: str = None, new_only: bool = False): + """ + Categorize projects using AI personas. + Uses batch_size_categorization from config. + + Args: + force_refresh: If True, wipe existing categories and regenerate all + target_persona_name: If specified, only process this persona + new_only: If True, only categorize repositories that don't have categories yet + """ + batch_size = self.config_manager.get_batch_size_categorization() + + if force_refresh: + if target_persona_name: + print(f"Force refresh enabled for persona '{target_persona_name}'. Wiping existing category data for this persona.") + self.data_manager.wipe_categories_data(persona_name=target_persona_name) + else: + print("Force refresh enabled for all personas. Wiping all existing category data.") + self.data_manager.wipe_categories_data() + + # Get summaries data + summaries_df = self.data_manager.get_summaries_data() + if summaries_df.empty: + print("No summarized data found to categorize. Skipping.") + return pd.DataFrame() + + if 'summary' not in summaries_df.columns: + print("Error: 'summary' column not found in summarized data. 
Cannot categorize.")
+            return pd.DataFrame()
+        if 'repo_artifact_id' not in summaries_df.columns:
+            print("Error: 'repo_artifact_id' not found in summarized data.")
+            return pd.DataFrame()
+
+        # Get personas to process
+        personas_to_process = []
+        if target_persona_name:
+            # Look up the persona by name; ConfigManager exposes get_personas() only
+            persona = next((p for p in self.config_manager.get_personas() if p['name'] == target_persona_name), None)
+            if persona:
+                personas_to_process = [persona]
+            else:
+                print(f"Error: Persona '{target_persona_name}' not found.")
+                return pd.DataFrame()
+        else:
+            personas_to_process = self.config_manager.get_personas()
+
+        if not personas_to_process:
+            print("No personas found to process.")
+            return pd.DataFrame()
+
+        # Process each persona
+        for persona in personas_to_process:
+            persona_name = persona['name']
+            print(f"\nProcessing persona: {persona_name}")
+
+            # Get existing categories for this persona if any
+            existing_categories_df = pd.DataFrame()
+            if not force_refresh:
+                try:
+                    existing_categories_df = self.data_manager.get_categories_data(persona_name)
+                except FileNotFoundError:
+                    pass # No existing categories for this persona
+
+            # If we have existing categories and not forcing refresh
+            if not existing_categories_df.empty and not force_refresh:
+                if new_only:
+                    # Filter out repositories that already have categories
+                    existing_repos = set(existing_categories_df['repo_artifact_id'])
+                    repos_to_process = summaries_df[~summaries_df['repo_artifact_id'].isin(existing_repos)]
+                    if repos_to_process.empty:
+                        print(f"No new repositories found to categorize for persona '{persona_name}'.")
+                        continue
+                    print(f"Found {len(repos_to_process)} new repositories to categorize for persona '{persona_name}'.")
+                else:
+                    print(f"Categories already exist for persona '{persona_name}' and force_refresh is false. Skipping.")
+                    continue
+            else:
+                repos_to_process = summaries_df
+
+            # Process in batches
+            all_categorized_data = []
+            for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc=f"Categorizing ({persona_name})", leave=False):
+                end_idx = min(start_idx + batch_size, len(repos_to_process))
+                batch_df = repos_to_process.iloc[start_idx:end_idx]
+
+                # Prepare list of dicts, each containing summary and metadata for a project
+                project_data_batch = []
+                required_metadata_cols = ['star_count', 'fork_count', 'created_at', 'updated_at']
+                for _, row in batch_df.iterrows():
+                    project_data = {
+                        'summary': row.get('summary', ''),
+                        'repo_artifact_id': row.get('repo_artifact_id', 'UNKNOWN_ID')
+                    }
+                    for col in required_metadata_cols:
+                        project_data[col] = row.get(col) # Will be None if missing, pandas NaT for dates
+                    project_data_batch.append(project_data)
+
+                if not project_data_batch or all(not item['summary'] for item in project_data_batch):
+                    print(f"Skipping batch for {persona_name} as all summaries are effectively empty.")
+                    classifications = [ClassificationOutput(assigned_tag="N/A", reason="Empty summary or batch")] * len(project_data_batch)
+                else:
+                    classifications: List[ClassificationOutput] = self.ai_service.classify_projects_batch_for_persona(
+                        project_data_batch,
+                        persona
+                    )
+
+                # Create a temporary DataFrame for this batch's results
+                temp_batch_df = batch_df.copy()
+                temp_batch_df[f"{persona_name}_tag"] = [c.assigned_tag for c in classifications]
+                temp_batch_df[f"{persona_name}_reason"] = [c.reason for c in classifications]
+                all_categorized_data.append(temp_batch_df)
+
+            if not all_categorized_data:
+                print(f"No categories were generated for persona '{persona_name}'.")
+                continue
+
+            new_categories_df = pd.concat(all_categorized_data, ignore_index=True)
+ + # If we have existing categories and not forcing refresh, combine with new ones + if not existing_categories_df.empty and not force_refresh: + final_categories_df = pd.concat([existing_categories_df, new_categories_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_categories_df = final_categories_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new categorization if there was a duplicate + ) + print(f"Combined data now contains {len(final_categories_df)} repositories with categories for persona '{persona_name}'.") + else: + final_categories_df = new_categories_df + + self.data_manager.save_categories_data(final_categories_df, persona_name) + + return pd.DataFrame() # Return empty DataFrame as we've saved the data + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + if dt_manager.get_summaries_data().empty: + print("No summarized data found. Please run SummaryGeneratorStep first or ensure data exists.") + else: + categorizer_step = CategorizerStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + print("\nRunning CategorizerStep...") + # Set force_refresh=True to re-categorize. + # Specify target_persona_name="keyword_spotter" to only run for one. + categorized_data = categorizer_step.run(force_refresh=False, target_persona_name=None) + + if not categorized_data.empty: + print(f"\nCategorized data head:\n{categorized_data.head()}") + print(f"Number of rows in categorized data: {len(categorized_data)}") + print(f"Columns: {categorized_data.columns.tolist()}") + else: + print("No data returned from categorization step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py new file mode 100644 index 00000000..e4fe539f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py @@ -0,0 +1,183 @@ +import pandas as pd +import numpy as np +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +# from ..config.prompts.tag_mappings import TAG_TO_CATEGORY # Removed + +class ConsolidatorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + + def run(self): + """Consolidate and analyze the classification results from all personas.""" + print("\nConsolidating analysis...") + + # Get the merged data from all personas + # DataManager's get_categories_data() without persona_name should provide this. + categorized_df = self.data_manager.get_categories_data() + + if categorized_df.empty: + print("No categorized data found to consolidate. Skipping.") + return pd.DataFrame() + + # Ensure essential columns are present + if 'repo_artifact_id' not in categorized_df.columns and 'project_id' not in categorized_df.columns: + print("Error: 'repo_artifact_id' or 'project_id' not found in categorized data.") + return pd.DataFrame() + + # Use 'project_id' for grouping if available, else 'repo_artifact_id' + # The original code used 'project_id' for project-level aggregation. + # The raw data from OSO has 'project_id'. Summaries and categories should retain it. 
+ + # Identify persona tag columns + personas = self.config_manager.get_personas() + persona_tag_cols = [f"{persona['name']}_tag" for persona in personas if f"{persona['name']}_tag" in categorized_df.columns] + + if not persona_tag_cols: + print("No persona tag columns found in the categorized data. Cannot consolidate.") + return categorized_df # Return as is, or an empty DF + + # Fill NaNs in numeric columns that might be used for weighting (e.g., star_count) + # These columns should ideally come from the raw_repos_data or summaries_data. + # The categorized_df from DataManager should already have these if merged correctly. + numeric_cols_to_fill = ['star_count', 'fork_count', 'num_packages_in_deps_dev'] + for col in numeric_cols_to_fill: + if col in categorized_df.columns: + categorized_df[col] = categorized_df[col].fillna(0) + else: + # If star_count is missing, we can't do weighted summary as originally designed. + # For now, we'll proceed without it if missing. + print(f"Warning: Column '{col}' not found for consolidation. Weighted summary might be affected.") + + # Drop readme_md if it exists, as it's large and not needed for consolidation + if 'readme_md' in categorized_df.columns: + categorized_df = categorized_df.drop(columns=['readme_md']) + + # Group by project_id to consolidate recommendations + # Define grouping keys. project_id is essential. + grouping_keys = ['project_id'] + # Add other descriptive columns that should be unique per project or take the first + if 'display_name' in categorized_df.columns: grouping_keys.append('display_name') + if 'atlas_id' in categorized_df.columns: grouping_keys.append('atlas_id') + + # Ensure grouping keys are valid and exist in the DataFrame + valid_grouping_keys = [key for key in grouping_keys if key in categorized_df.columns] + if 'project_id' not in valid_grouping_keys: + print("Critical error: 'project_id' is missing. 
Cannot perform project-level consolidation.") + # Save the repo-level data with repo-level recommendations if project_id is missing + # This part re-uses the previous logic for repo-level recommendation if grouping fails + repo_recommendations = [] + if not categorized_df.empty and persona_tag_cols: + for index, row in categorized_df.iterrows(): + assignments = [row[col] for col in persona_tag_cols if pd.notna(row[col]) and row[col] not in ["Error", "N/A", "Other"]] + if assignments: + mode_series = pd.Series(assignments).mode() + repo_recommendations.append(mode_series[0] if not mode_series.empty else 'Other') + else: + repo_recommendations.append('Other') + categorized_df['recommendation'] = repo_recommendations + else: + categorized_df['recommendation'] = 'Other' + self.data_manager.save_consolidated_data(categorized_df) + print("Consolidated analysis saved (repo-level due to missing project_id).") + return categorized_df + + print(f"Consolidating at project level using keys: {valid_grouping_keys}") + + def aggregate_project_data(group): + # New logic for star-weighted recommendation + category_star_weights = {} # Stores sum of stars for each category + + for _, repo_row in group.iterrows(): # Iterate over each repo in the project + stars = repo_row.get('star_count', 0) # star_count was already filled with 0 for NaNs + + # Ensure stars is a non-negative number (already handled by fillna(0) but good practice) + if pd.isna(stars) or not isinstance(stars, (int, float)) or stars < 0: + stars = 0 + else: + stars = int(stars) # Ensure it's an integer for summation + + for p_col in persona_tag_cols: # Iterate over each persona's tag column + category = repo_row.get(p_col) + # Check if category is valid + if pd.notna(category) and category not in ["Error", "N/A", "Other"]: + category_star_weights[category] = category_star_weights.get(category, 0) + stars + + if not category_star_weights: + recommendation = 'Other' + else: + # Find the category with the maximum accumulated star weight + # pd.Series(category_star_weights).idxmax() returns the category (index) with the max value + recommendation = pd.Series(category_star_weights).idxmax() + + # Aggregate other fields + agg_data = { + 'recommendation': recommendation, + 'repo_artifact_namespaces': list(group['repo_artifact_namespace'].unique()) if 'repo_artifact_namespace' in group else [], + 'repo_count': group['repo_artifact_id'].nunique() if 'repo_artifact_id' in group else 0, + 'total_stars': group['star_count'].sum() if 'star_count' in group else 0, + 'total_forks': group['fork_count'].sum() if 'fork_count' in group else 0, + # Add summaries of the top N repos or a combined summary if needed + # For now, let's take the summary of the first repo in the group (by original order) + 'sample_summary': group['summary'].iloc[0] if 'summary' in group and not group['summary'].empty else "" + } + # Add persona tags for the project (e.g., mode of each persona's tags for this project) + for p_col in persona_tag_cols: + persona_project_tags = group[p_col].dropna().tolist() + valid_persona_tags = [tag for tag in persona_project_tags if tag not in ["Error", "N/A", "Other"]] + if valid_persona_tags: + agg_data[f"{p_col}_mode"] = pd.Series(valid_persona_tags).mode()[0] if pd.Series(valid_persona_tags).mode().any() else "N/A" + else: + agg_data[f"{p_col}_mode"] = "N/A" + + return pd.Series(agg_data) + + # Group by valid_grouping_keys and apply aggregation + # Use as_index=False if valid_grouping_keys are to be columns, otherwise they become index + 
project_consolidated_df = categorized_df.groupby(valid_grouping_keys, as_index=False).apply(aggregate_project_data) + + # If groupby().apply() changes the structure unexpectedly (e.g. multi-index if as_index=True was used) + # ensure project_consolidated_df is flat. With as_index=False, it should be. + # If aggregate_project_data returns a Series, and groupby has as_index=False, + # the result should be a DataFrame where grouping keys are columns, and new columns from Series. + # If apply returns a DataFrame, it might need reset_index(). + # Let's ensure it's flat: + if not isinstance(project_consolidated_df.index, pd.RangeIndex): + project_consolidated_df = project_consolidated_df.reset_index() + + + final_df = project_consolidated_df + + # Save results + print(f"\nSaving consolidated analysis (project-level)...") + self.data_manager.save_consolidated_data(final_df) + print("Consolidated analysis saved successfully.") + return final_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure categorized data exists (run categorizer.py example first if needed) + # DataManager's get_categories_data() should merge individual persona files. + if dt_manager.get_categories_data().empty: + print("No categorized data found. Please run CategorizerStep first or ensure data exists.") + else: + consolidator_step = ConsolidatorStep( + data_manager=dt_manager, + config_manager=cfg_manager + ) + print("\nRunning ConsolidatorStep...") + consolidated_df = consolidator_step.run() + + if not consolidated_df.empty: + print(f"\nConsolidated data head:\n{consolidated_df.head()}") + print(f"Number of rows in consolidated data: {len(consolidated_df)}") + print(f"Consolidated columns: {consolidated_df.columns.tolist()}") + print(f"\nRecommendations sample:\n{consolidated_df[['project_id', 'display_name', 'recommendation']].head() if 'project_id' in consolidated_df.columns and 'display_name' in consolidated_df.columns else consolidated_df['recommendation'].head()}") + + else: + print("No data returned from consolidation step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py new file mode 100644 index 00000000..4c8eef66 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py @@ -0,0 +1,292 @@ +import pandas as pd +import json +from pathlib import Path +import shutil +from typing import List, Dict, Any +from ..config.settings import PROJECT_ROOT + +class DataManager: + def __init__(self, output_dir: Path, config=None): + self.output_dir = output_dir + self.config = config # For future use, e.g., different storage backends + + # Legacy paths + self.raw_parquet_path = self.output_dir / "devtooling_raw.parquet" + self.summarized_parquet_path = self.output_dir / "devtooling_summarized.parquet" + self.categorized_dir = self.output_dir / "categorized" + self.final_parquet_path = self.output_dir / "devtooling_full.parquet" + self.consolidated_csv_path = self.output_dir / "devtooling_consolidated.csv" + + # New unified data paths - ensure they're in the current repo's output directory + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + self.unified_parquet_path = local_output_dir / "ethereum_repos_unified.parquet" + self.unified_csv_path = 
local_output_dir / "ethereum_repos_unified.csv" + + # Create directories if they don't exist + self.output_dir.mkdir(parents=True, exist_ok=True) + self.categorized_dir.mkdir(parents=True, exist_ok=True) + + def get_repos_data(self) -> pd.DataFrame: + """Get the latest repository data""" + if self.raw_parquet_path.exists(): + return pd.read_parquet(self.raw_parquet_path) + return pd.DataFrame() + + def get_summaries_data(self) -> pd.DataFrame: + """Get the latest summaries data""" + if self.summarized_parquet_path.exists(): + return pd.read_parquet(self.summarized_parquet_path) + return pd.DataFrame() + + def get_categories_data(self, persona_name: str = None) -> pd.DataFrame: + """Get the latest categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / f"{persona_name}.parquet" + if persona_file.exists(): + return pd.read_parquet(persona_file) + return pd.DataFrame() + else: + # Combine all persona files + all_persona_dfs = [] + for persona_file in self.categorized_dir.glob("*.parquet"): + df = pd.read_parquet(persona_file) + all_persona_dfs.append(df) + + if not all_persona_dfs: + return pd.DataFrame() + + # Concatenate all dataframes. If a project appears in multiple files, + # the last one read will take precedence for shared columns (like 'summary'). + # Persona-specific columns (e.g., 'persona_X_tag') will be unique. + # We need a more robust way to merge these if there are overlapping non-persona columns. + # For now, assuming 'project_id' or 'repo_artifact_id' is the key. + + # A simple concat might lead to duplicate columns if not handled carefully. + # Let's assume each persona file has unique columns for its tags/reasons. + # And common columns like 'project_id', 'summary' are present. + + # Start with the summaries data as the base + base_df = self.get_summaries_data() + if base_df.empty: + # If no summaries, try to load from the first persona file as a base + if all_persona_dfs: + base_df = all_persona_dfs[0][['project_id', 'repo_artifact_id', 'summary']].copy() # Adjust columns as needed + else: + return pd.DataFrame() + + + # Set index for joining + if 'repo_artifact_id' in base_df.columns: + base_df = base_df.set_index('repo_artifact_id') + elif 'project_id' in base_df.columns: + base_df = base_df.set_index('project_id') + else: + # Fallback if no clear index, this might lead to issues + print("Warning: No clear index (project_id or repo_artifact_id) for merging category data.") + + + for df_persona in all_persona_dfs: + # Identify the persona name from its columns (e.g., "keyword_spotter_tag") + current_persona_name = None + for col_name in df_persona.columns: + if col_name.endswith("_tag"): + current_persona_name = col_name.replace("_tag", "") + break + + if not current_persona_name: + print(f"Warning: Could not determine persona name from columns in a categorized file. Skipping this file.") + continue + + # Columns to join are just the tag and reason for this specific persona + persona_tag_col = f"{current_persona_name}_tag" + persona_reason_col = f"{current_persona_name}_reason" + + cols_from_persona_df = [] + if persona_tag_col in df_persona.columns: + cols_from_persona_df.append(persona_tag_col) + if persona_reason_col in df_persona.columns: + cols_from_persona_df.append(persona_reason_col) + + if not cols_from_persona_df: + print(f"Warning: No tag/reason columns found for persona {current_persona_name} in its file. 
Skipping join for this persona.") + continue + + # Set index for df_persona before selecting columns for join + if base_df.index.name in df_persona.columns: # base_df.index.name is 'repo_artifact_id' or 'project_id' + df_persona_indexed = df_persona.set_index(base_df.index.name) + else: + print(f"Warning: Index column '{base_df.index.name}' not found in persona DataFrame for {current_persona_name}. Attempting join without re-indexing persona df, might be incorrect.") + df_persona_indexed = df_persona # This might lead to issues if not indexed properly + + # Ensure only existing columns are selected from df_persona_indexed + valid_cols_to_join = [col for col in cols_from_persona_df if col in df_persona_indexed.columns] + + if not valid_cols_to_join: + print(f"Warning: Persona specific columns {cols_from_persona_df} not found as actual columns in indexed persona dataframe for {current_persona_name}. Skipping join for this persona.") + continue + + base_df = base_df.join(df_persona_indexed[valid_cols_to_join], how='left', rsuffix=f'_{current_persona_name}_dup') + + # Clean up duplicate columns if any (this is a basic cleanup for rsuffix) + cols_to_drop = [col for col in base_df.columns if '_dup' in col] + base_df.drop(columns=cols_to_drop, inplace=True, errors='ignore') + + return base_df.reset_index() + + + def save_repos_data(self, data: pd.DataFrame): + """Save repository data""" + data.to_parquet(self.raw_parquet_path, index=False) + print(f"Repository data saved to {self.raw_parquet_path}") + + def save_summaries_data(self, data: pd.DataFrame, append: bool = False): + """Save summaries data. If append is True, appends to existing file if it exists.""" + if append and self.summarized_parquet_path.exists(): + existing_df = pd.read_parquet(self.summarized_parquet_path) + # Ensure no duplicate columns before concat, especially if 'summary' is regenerated + # A more robust merge/update might be needed depending on exact requirements + data_to_save = pd.concat([existing_df, data]).drop_duplicates(subset=['repo_artifact_id'], keep='last') # Assuming repo_artifact_id is unique key + else: + data_to_save = data + data_to_save.to_parquet(self.summarized_parquet_path, index=False) + print(f"Summaries data saved to {self.summarized_parquet_path}") + + def save_categories_data(self, data: pd.DataFrame, persona_name: str): + """Save categories data for a specific persona""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + data.to_parquet(persona_file, index=False) + print(f"Categories data for persona {persona_name} saved to {persona_file}") + + def save_consolidated_data(self, data: pd.DataFrame): + """Save consolidated data to Parquet and CSV""" + data.to_parquet(self.final_parquet_path, index=False) + print(f"Consolidated Parquet data saved to {self.final_parquet_path}") + data.to_csv(self.consolidated_csv_path, index=False) + print(f"Consolidated CSV data saved to {self.consolidated_csv_path}") + + def wipe_repos_data(self): + """Wipe repository data""" + if self.raw_parquet_path.exists(): + self.raw_parquet_path.unlink() + print(f"Wiped repository data: {self.raw_parquet_path}") + + def wipe_summaries_data(self): + """Wipe summaries data""" + if self.summarized_parquet_path.exists(): + self.summarized_parquet_path.unlink() + print(f"Wiped summaries data: {self.summarized_parquet_path}") + + def wipe_categories_data(self, persona_name: str = None): + """Wipe categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / 
f"{persona_name}.parquet" + if persona_file.exists(): + persona_file.unlink() + print(f"Wiped categories data for persona {persona_name}: {persona_file}") + else: + if self.categorized_dir.exists(): + shutil.rmtree(self.categorized_dir) + self.categorized_dir.mkdir(parents=True, exist_ok=True) # Recreate after wiping + print(f"Wiped all categories data in {self.categorized_dir}") + + def has_categories_for_persona(self, persona_name: str) -> bool: + """Check if category data exists for a specific persona.""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + return persona_file.exists() + + def get_final_parquet_path(self) -> Path: + return self.final_parquet_path + + def get_consolidated_csv_path(self) -> Path: + return self.consolidated_csv_path + + # New methods for unified data structure + + def save_unified_data(self, data: pd.DataFrame): + """ + Save unified repository data to Parquet and CSV. + This data includes all repositories, summaries, and categorizations in a single structure. + """ + # Ensure categorizations column is properly serialized for Parquet + if 'categorizations' in data.columns: + # Convert categorizations to strings for storage + # This is necessary because Parquet doesn't handle complex nested structures well + data_copy = data.copy() + data_copy['categorizations_json'] = data_copy['categorizations'].apply( + lambda x: json.dumps(x) if isinstance(x, list) else '[]' + ) + + # Save to Parquet (without the original categorizations column) + parquet_data = data_copy.drop(columns=['categorizations']) + parquet_data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Save to CSV for easier viewing (also without the complex column) + csv_data = parquet_data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + # Truncate other potentially long text fields + for col in ['categorizations_json']: + if col in csv_data.columns: + csv_data[col] = csv_data[col].apply( + lambda x: (x[:50] + '...') if isinstance(x, str) and len(x) > 50 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + else: + # If no categorizations column, save as is + data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Create a readable CSV version + csv_data = data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + + def get_unified_data(self) -> pd.DataFrame: + """ + Get the unified repository data with properly deserialized categorizations. 
+ """ + if not self.unified_parquet_path.exists(): + return pd.DataFrame() + + # Load the data from Parquet + data = pd.read_parquet(self.unified_parquet_path) + + # Deserialize the categorizations from JSON if present + if 'categorizations_json' in data.columns: + data['categorizations'] = data['categorizations_json'].apply( + lambda x: json.loads(x) if isinstance(x, str) else [] + ) + data = data.drop(columns=['categorizations_json']) + + return data + + def wipe_unified_data(self): + """Wipe unified data files""" + if self.unified_parquet_path.exists(): + self.unified_parquet_path.unlink() + print(f"Wiped unified data: {self.unified_parquet_path}") + if self.unified_csv_path.exists(): + self.unified_csv_path.unlink() + print(f"Wiped unified CSV data: {self.unified_csv_path}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py new file mode 100644 index 00000000..fc5f398f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py @@ -0,0 +1,125 @@ +import pandas as pd +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.fetcher import DataFetcher + +class RepositoryFetcherStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + self.fetcher = DataFetcher() # Assuming DataFetcher doesn't need config for initialization + + def run(self, force_refresh: bool = False, fetch_new_only: bool = False): + """ + Fetch repositories and READMEs. + Uses test_mode and test_mode_limit from config if test_mode is enabled. + + Args: + force_refresh: If True, wipe existing data and fetch everything fresh + fetch_new_only: If True, only fetch repositories that don't exist in current data + """ + limit = None + sort_by_stars_in_test = False + is_test = self.config_manager.is_test_mode() + + if is_test: + limit = self.config_manager.get_test_mode_limit() + sort_by_stars_in_test = True # Always sort by stars in test mode as per new req + print(f"Running in TEST MODE: Targeting up to {limit} repositories, sorted by stars DESC.") + + if force_refresh: + print("Force refresh enabled for repository data. Wiping existing raw data.") + self.data_manager.wipe_repos_data() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_repos_data() + if not existing_df.empty: + if fetch_new_only: + print("Fetching only new repositories while keeping existing ones...") + else: + print("Repository data already exists and force_refresh is false.") + if is_test: + if 'star_count' in existing_df.columns: + print(f"Applying test mode (sort by stars, limit {limit}) to existing data.") + sorted_df = existing_df.sort_values(by='star_count', ascending=False) + return sorted_df.head(limit) + else: + print(f"Warning: 'star_count' not in existing data. Using first {limit} entries for test mode.") + return existing_df.head(limit) + return existing_df # Not test mode, return all existing + + # If here, either force_refresh is true or data doesn't exist. 
+ print("Fetching repositories from OSO...") + # Pass sort_by_stars only if in test_mode, limit is passed anyway (None if not test) + new_repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=sort_by_stars_in_test) + + if new_repos_df.empty: + print("No repositories found from OSO fetch.") + # Save an empty DataFrame to indicate the step ran + self.data_manager.save_repos_data(pd.DataFrame()) + return pd.DataFrame() + + print(f"Found {len(new_repos_df)} repositories from OSO.") + + if fetch_new_only and not existing_df.empty: + # Filter out repositories that already exist + existing_repos = set(zip(existing_df['repo_artifact_namespace'], existing_df['repo_artifact_name'])) + new_repos_df = new_repos_df[~new_repos_df.apply( + lambda x: (x['repo_artifact_namespace'], x['repo_artifact_name']) in existing_repos, + axis=1 + )] + print(f"Found {len(new_repos_df)} new repositories to process.") + + if new_repos_df.empty: + print("No new repositories to process.") + return existing_df + + print("Fetching READMEs from GitHub...") + # Ensure 'repo_artifact_namespace' and 'repo_artifact_name' exist + if 'repo_artifact_namespace' not in new_repos_df.columns or 'repo_artifact_name' not in new_repos_df.columns: + print("Error: 'repo_artifact_namespace' or 'repo_artifact_name' not in fetched data.") + # Save what we have so far + self.data_manager.save_repos_data(new_repos_df) + return new_repos_df # Or handle error more gracefully + + new_repos_df = self.fetcher.get_all_readmes(new_repos_df) + print(f"Retrieved READMEs for {len(new_repos_df[new_repos_df['readme_md'] != ''])} repositories.") + + # Combine existing and new data + if not existing_df.empty: + combined_df = pd.concat([existing_df, new_repos_df], ignore_index=True) + # Remove any duplicates that might have been introduced + combined_df = combined_df.drop_duplicates( + subset=['repo_artifact_namespace', 'repo_artifact_name'], + keep='first' + ) + print(f"Combined data now contains {len(combined_df)} repositories.") + self.data_manager.save_repos_data(combined_df) + + # If in test mode and combined data exceeds limit + if limit is not None and len(combined_df) > limit: + if 'star_count' in combined_df.columns: + return combined_df.sort_values(by='star_count', ascending=False).head(limit) + return combined_df.head(limit) + return combined_df + else: + self.data_manager.save_repos_data(new_repos_df) + # If in test mode and fetched more than limit + if limit is not None and len(new_repos_df) > limit: + return new_repos_df.head(limit) + return new_repos_df + +if __name__ == '__main__': + # Example Usage (requires .env file and OSO/GitHub credentials) + # Ensure pipeline_config.json exists or is created with defaults + cfg_manager = ConfigManager() + + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + repo_fetch_step = RepositoryFetcherStep(data_manager=dt_manager, config_manager=cfg_manager) + + print("\nRunning RepositoryFetcherStep...") + fetched_data = repo_fetch_step.run(force_refresh=False) # Set True to wipe and refetch + print(f"\nFetched data head:\n{fetched_data.head()}") + print(f"Number of rows fetched: {len(fetched_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py new file mode 100644 index 00000000..f09b0d78 --- /dev/null +++ 
b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py @@ -0,0 +1,131 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput + +class SummaryGeneratorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, new_only: bool = False): + """ + Generate summaries for repositories. + Uses batch_size_summaries from config. + + Args: + force_refresh: If True, wipe existing summaries and regenerate all + new_only: If True, only generate summaries for repositories that don't have them yet + """ + batch_size = self.config_manager.get_batch_size_summaries() + + if force_refresh: + print("Force refresh enabled for summaries. Wiping existing summarized data.") + self.data_manager.wipe_summaries_data() + existing_summaries_df = pd.DataFrame() + else: + existing_summaries_df = self.data_manager.get_summaries_data() + + # Get repository data + repos_df = self.data_manager.get_repos_data() + if repos_df.empty: + print("No repository data found to generate summaries. Skipping.") + # Save an empty DataFrame to indicate the step ran if forced + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + # If we have existing summaries and not forcing refresh + if not existing_summaries_df.empty and not force_refresh: + if new_only: + # Filter out repositories that already have summaries + existing_repos = set(existing_summaries_df['repo_artifact_id']) + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(existing_repos)] + if repos_to_process.empty: + print("No new repositories found to generate summaries for.") + return existing_summaries_df + print(f"Found {len(repos_to_process)} new repositories to generate summaries for.") + else: + print("Summarized data already exists and force_refresh is false. Skipping summary generation.") + return existing_summaries_df + else: + repos_to_process = repos_df + + # Ensure 'readme_md' and 'repo_artifact_id' columns exist + if 'readme_md' not in repos_to_process.columns: + print("Error: 'readme_md' column not found in repository data. Cannot generate summaries.") + return pd.DataFrame() + if 'repo_artifact_id' not in repos_to_process.columns: + print("Error: 'repo_artifact_id' column not found. 
This ID is crucial.") + return pd.DataFrame() + + print(f"Generating summaries for {len(repos_to_process)} repositories in batches of {batch_size}...") + + all_summaries_data = [] # To collect all rows with new summaries + + # Process in batches + for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc="Generating Summaries"): + end_idx = min(start_idx + batch_size, len(repos_to_process)) + batch_df_initial = repos_to_process.iloc[start_idx:end_idx] + + # Create a working copy for this batch to add summaries + batch_df_processed = batch_df_initial.copy() + + summaries = [] + for _, row in batch_df_initial.iterrows(): + readme_content = row.get('readme_md', "") + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + summaries.append(summary_output.summary) + + batch_df_processed["summary"] = summaries + all_summaries_data.append(batch_df_processed) + + if not all_summaries_data: + print("No summaries were generated.") + # Save an empty DataFrame if no summaries were made but the step was intended to run + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + new_summaries_df = pd.concat(all_summaries_data, ignore_index=True) + + # If we have existing summaries and not forcing refresh, combine with new ones + if not existing_summaries_df.empty and not force_refresh: + final_summarized_df = pd.concat([existing_summaries_df, new_summaries_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_summarized_df = final_summarized_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new summary if there was a duplicate + ) + print(f"Combined data now contains {len(final_summarized_df)} repositories with summaries.") + else: + final_summarized_df = new_summaries_df + + self.data_manager.save_summaries_data(final_summarized_df) + + return final_summarized_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure repo data exists (run repo_fetcher.py example first if needed) + if dt_manager.get_repos_data().empty: + print("No repository data found. 
Please run RepositoryFetcherStep first or ensure data exists.") + else: + summary_gen_step = SummaryGeneratorStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning SummaryGeneratorStep...") + # Set force_refresh=True to regenerate even if file exists + summarized_data = summary_gen_step.run(force_refresh=False) + print(f"\nSummarized data head:\n{summarized_data.head()}") + print(f"Number of rows with summaries: {len(summarized_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py new file mode 100644 index 00000000..c5f310c7 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py @@ -0,0 +1,279 @@ +import pandas as pd +import datetime +import json +from typing import List, Dict, Any, Optional +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput, ClassificationOutput +from ..processing.fetcher import DataFetcher + +class UnifiedProcessor: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + self.fetcher = DataFetcher() + + def run(self, + force_refresh: bool = False, + include_forks: bool = False, + inactive_repos: bool = False, + limit: Optional[int] = None): + """ + Unified processing pipeline that fetches repositories, READMEs, generates summaries, + and categorizes them in a single pass. + + Args: + force_refresh: If True, wipe existing data and process everything fresh + include_forks: If True, include forked repositories in processing + inactive_repos: If True, include repositories not updated in the last year + limit: Optional limit on number of repositories to process + """ + # Get test mode settings + is_test = self.config_manager.is_test_mode() + if is_test: + test_limit = self.config_manager.get_test_mode_limit() + if limit is None or limit > test_limit: + limit = test_limit + print(f"Running in TEST MODE: Limiting to {limit} repositories, sorted by stars DESC.") + + # Determine batch sizes + batch_size = min( + self.config_manager.get_batch_size_summaries(), + self.config_manager.get_batch_size_categorization() + ) + + # Get existing data if not forcing refresh + if force_refresh: + print("Force refresh enabled. Wiping existing data.") + self.data_manager.wipe_repos_data() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_repos_data() + if not existing_df.empty: + print(f"Found existing data with {len(existing_df)} repositories.") + + # Fetch repositories from OSO + print("Fetching repositories from OSO...") + repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=True) + + if repos_df.empty: + print("No repositories found from OSO fetch.") + return pd.DataFrame() + + print(f"Found {len(repos_df)} repositories from OSO.") + + # Filter repositories based on parameters + if not include_forks: + repos_df = repos_df[~repos_df['is_fork']] + print(f"Filtered out forks. {len(repos_df)} repositories remaining.") + + if not inactive_repos: + repos_df = repos_df[repos_df['is_actively_maintained']] + print(f"Filtered out inactive repositories. 
{len(repos_df)} repositories remaining.") + + # Determine which repositories need processing + if not existing_df.empty and not force_refresh: + # Identify repositories that have already been fully processed + processed_repos = set() + if 'categorizations' in existing_df.columns: + processed_repos = set( + existing_df[existing_df['categorizations'].apply(lambda x: isinstance(x, list) and len(x) > 0)]['repo_artifact_id'] + ) + + # Filter out already processed repositories + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(processed_repos)] + print(f"Found {len(repos_to_process)} repositories that need processing.") + + # Combine with existing data for final output + combined_df = pd.concat([ + existing_df[existing_df['repo_artifact_id'].isin(processed_repos)], + self._process_repositories(repos_to_process, batch_size) + ], ignore_index=True) + + # Save the combined data + self.data_manager.save_unified_data(combined_df) + return combined_df + else: + # Process all repositories + processed_df = self._process_repositories(repos_df, batch_size) + self.data_manager.save_unified_data(processed_df) + return processed_df + + def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.DataFrame: + """ + Process repositories in batches: fetch READMEs, generate summaries, and categorize. + + Args: + repos_df: DataFrame containing repositories to process + batch_size: Number of repositories to process in each batch + + Returns: + DataFrame with processed repositories + """ + print(f"Processing {len(repos_df)} repositories in batches of {batch_size}...") + + # Get personas for categorization + personas = self.config_manager.get_personas() + if not personas: + print("No personas found for categorization.") + return repos_df + + # Process in batches + all_processed_data = [] + + for start_idx in tqdm(range(0, len(repos_df), batch_size), desc="Processing Repositories"): + end_idx = min(start_idx + batch_size, len(repos_df)) + batch_df = repos_df.iloc[start_idx:end_idx].copy() + + # Fetch READMEs for this batch + batch_df = self.fetcher.get_all_readmes(batch_df) + + # Initialize the categorizations column with empty lists + batch_df['categorizations'] = [[] for _ in range(len(batch_df))] + batch_df['final_recommendation'] = 'UNCATEGORIZED' + batch_df['processing_timestamp'] = datetime.datetime.now().isoformat() + batch_df['summary'] = '' + + # Process each repository in the batch + for idx, row in tqdm(batch_df.iterrows(), desc="Processing repositories in batch", total=len(batch_df), leave=False): + # Initialize categorizations list + categorizations = [] + + # Get README status + readme_status = row.get('readme_status', 'ERROR') + + # Generate summary if README is available + summary = "" + if readme_status == "SUCCESS": + readme_content = row.get('readme_md', "") + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + summary = summary_output.summary + + # Categorize with each persona + for persona in tqdm(personas, desc=f"Categorizing {row.get('repo_artifact_name', 'repo')} with personas", leave=False): + try: + # Prepare project data for categorization + project_data = { + 'summary': summary, + 'repo_artifact_id': row.get('repo_artifact_id', 'UNKNOWN_ID'), + 'star_count': row.get('star_count', 0), + 'fork_count': row.get('fork_count', 0), + 'created_at': row.get('created_at'), + 'updated_at': row.get('updated_at') + } + + # Get categorization from this persona + classifications = self.ai_service.classify_projects_batch_for_persona( + 
[project_data], + persona + ) + + if classifications and len(classifications) > 0: + classification = classifications[0] + categorizations.append({ + 'persona_name': persona['name'], + 'category': classification.assigned_tag, + 'reason': classification.reason, + 'timestamp': datetime.datetime.now().isoformat() + }) + else: + categorizations.append({ + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': 'Failed to get classification from AI service', + 'timestamp': datetime.datetime.now().isoformat() + }) + except Exception as e: + print(f"Error categorizing with persona {persona['name']}: {e}") + categorizations.append({ + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'Error: {str(e)}', + 'timestamp': datetime.datetime.now().isoformat() + }) + else: + # If README is empty or error, mark all categorizations as UNCATEGORIZED + for persona in tqdm(personas, desc=f"Marking {row.get('repo_artifact_name', 'repo')} as UNCATEGORIZED", leave=False): + categorizations.append({ + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'README {readme_status}', + 'timestamp': datetime.datetime.now().isoformat() + }) + + # Determine final recommendation based on categorizations + final_recommendation = self._determine_final_recommendation(categorizations, row.get('star_count', 0)) + + # Update the row with processed data + batch_df.at[idx, 'summary'] = summary + batch_df.at[idx, 'categorizations'] = categorizations + batch_df.at[idx, 'final_recommendation'] = final_recommendation + batch_df.at[idx, 'processing_timestamp'] = datetime.datetime.now().isoformat() + + all_processed_data.append(batch_df) + + if not all_processed_data: + print("No data was processed.") + return pd.DataFrame() + + return pd.concat(all_processed_data, ignore_index=True) + + def _determine_final_recommendation(self, categorizations: List[Dict[str, Any]], star_count: int) -> str: + """ + Determine the final recommendation based on categorizations from all personas. 
+ + Args: + categorizations: List of categorization dictionaries + star_count: Star count of the repository (for potential future weighting) + + Returns: + Final category recommendation + """ + # Filter out UNCATEGORIZED entries + valid_categories = [c['category'] for c in categorizations if c['category'] != 'UNCATEGORIZED'] + + if not valid_categories: + return 'UNCATEGORIZED' + + # Count occurrences of each category + category_counts = {} + for category in valid_categories: + category_counts[category] = category_counts.get(category, 0) + 1 + + # Find the most common category + max_count = 0 + final_category = 'UNCATEGORIZED' + + for category, count in category_counts.items(): + if count > max_count: + max_count = count + final_category = category + + return final_category + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + processor = UnifiedProcessor( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning UnifiedProcessor...") + processed_data = processor.run( + force_refresh=False, + include_forks=False, + inactive_repos=False + ) + + if not processed_data.empty: + print(f"\nProcessed data head:\n{processed_data.head()}") + print(f"Number of rows processed: {len(processed_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py new file mode 100644 index 00000000..e1ebbcd8 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py @@ -0,0 +1,10 @@ +from .fetcher import DataFetcher +from .ai_service import AIService + +# The old Summarizer class has been effectively replaced by AIService +# and the pipeline steps (SummaryGeneratorStep, CategorizerStep). 
+ +__all__ = [ + "DataFetcher", + "AIService", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py new file mode 100644 index 00000000..074767dd --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py @@ -0,0 +1,263 @@ +import json +import pandas as pd +import time +from dataclasses import dataclass, asdict +from typing import List, Type, TypeVar, Union, Dict, Any +import google.generativeai as genai +from ..config.config_manager import ConfigManager + +# Define generic type for output classes +T = TypeVar( + 'T', + bound=Union['SummaryOutput', 'TagsOutput', 'ClassificationOutput', 'BatchClassificationOutput'] +) + +@dataclass +class SummaryOutput: + summary: str + +@dataclass +class TagsOutput: + tags: List[str] + +@dataclass +class ClassificationOutput: + assigned_tag: str + reason: str + +@dataclass +class BatchClassificationOutput: + classifications: List[ClassificationOutput] + + +class AIService: + def __init__(self, config_manager: ConfigManager): + self.config_manager = config_manager + self.api_key = self.config_manager.get_gemini_api_key() # Use specific getter + self.model_name = self.config_manager.get("gemini_model") # Model name can stay in JSON config + + if not self.api_key: + raise ValueError("GEMINI_API_KEY not found in configuration.") + if not self.model_name: + raise ValueError("GEMINI_MODEL not found in configuration.") + + genai.configure(api_key=self.api_key) + self.model = genai.GenerativeModel(self.model_name) + self.request_count = 0 + self.start_time = time.time() + + def _rate_limit_control(self): + """Basic rate limiting: 60 requests per minute for flash models.""" + self.request_count += 1 + elapsed_time = time.time() - self.start_time + if elapsed_time < 60 and self.request_count > 55: # Slight safety margin + sleep_time = 60 - elapsed_time + print(f"Rate limit approaching. 
Sleeping for {sleep_time:.2f} seconds.") + time.sleep(sleep_time) + self.request_count = 0 + self.start_time = time.time() + elif elapsed_time >= 60: + self.request_count = 0 + self.start_time = time.time() + + + def execute_query(self, prompt: str, output_class: Type[T]) -> T: + """Execute a query against the Gemini API and parse the response.""" + self._rate_limit_control() + print(f"\nSending prompt to Gemini (model: {self.model_name})...") + + try: + response = self.model.generate_content(prompt) + except Exception as e: + print(f"Error calling Gemini API: {e}") + # Fallback for errors + if output_class is SummaryOutput: + return SummaryOutput(summary="Error generating summary.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Error", reason="API call failed.") + if output_class is BatchClassificationOutput: + return BatchClassificationOutput(classifications=[]) + raise + + try: + text = response.text.strip() + # Try to find JSON block, robustly + json_str = None + if output_class is BatchClassificationOutput: # Expects a list + start_brace = text.find("[") + end_brace = text.rfind("]") + 1 # Add 1 to include the closing bracket + else: # Expects an object + start_brace = text.find("{") + end_brace = text.rfind("}") + 1 # Add 1 to include the closing brace + + if start_brace != -1 and end_brace > start_brace: + json_str = text[start_brace:end_brace] + data = json.loads(json_str) + else: + print("No valid JSON found in response.") + raise ValueError("No JSON object/array found in response") + + if output_class is SummaryOutput: + return SummaryOutput(summary=data.get("summary", "Summary not found in response.")) + if output_class is TagsOutput: + return TagsOutput(tags=data.get("tags", [])) + if output_class is ClassificationOutput: # For single classification + return ClassificationOutput( + assigned_tag=data.get("assigned_tag", "Other"), + reason=data.get("reason", "Could not classify project from response.") + ) + if output_class is BatchClassificationOutput: # For batch classification + classifications_data = data # data is already the list + parsed_classifications = [ + ClassificationOutput( + assigned_tag=item.get("assigned_tag", "Other"), + reason=item.get("reason", "Could not classify.") + ) for item in classifications_data + ] + return BatchClassificationOutput(classifications=parsed_classifications) + + raise ValueError(f"Unknown output class: {output_class}") + + except (json.JSONDecodeError, ValueError) as e: + print(f"Error processing Gemini response: {e}. 
Raw text: '{response.text[:300]}...'") + if output_class is SummaryOutput: + return SummaryOutput(summary="Failed to parse summary from response.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Other", reason="Failed to parse classification.") + if output_class is BatchClassificationOutput: + # Return empty list of classifications for the batch + return BatchClassificationOutput(classifications=[]) + raise + + def make_summary(self, readme_md: str) -> SummaryOutput: + """Generate a summary of the project based on its README.""" + if not readme_md or not readme_md.strip(): + return SummaryOutput(summary="This appears to be an empty repository without a README file.") + + prompt_template = self.config_manager.get_summary_prompt_template() + prompt = prompt_template.format(readme_md=readme_md) + return self.execute_query(prompt, SummaryOutput) + + def make_tags(self, summary: str) -> TagsOutput: + """Generate tags for the project based on its summary.""" + if not summary or "empty repository" in summary.lower() or "error generating summary" in summary.lower(): + return TagsOutput(tags=[]) + + prompt_template = self.config_manager.get_tags_prompt_template() + prompt = prompt_template.format(summary=summary) + return self.execute_query(prompt, TagsOutput) + + def classify_projects_batch_for_persona( + self, + project_data_batch: List[Dict[str, Any]], # Changed from summaries: List[str] + persona: Dict[str, Any] + ) -> List[ClassificationOutput]: + """ + Classify multiple projects at once for a specific persona using their summaries and metadata. + Each item in project_data_batch is a dict with 'summary', 'star_count', etc. + The persona dictionary should contain 'name', 'title', 'description', and 'prompt' (template). + """ + if not project_data_batch: + return [] + + categories_list_str = "\n".join( + f"- \"{c['category']}\": {c['description']}" # Ensure category names are quoted for clarity in prompt + for c in self.config_manager.get_categories() + ) + + persona_prompt_template = persona.get('prompt') + if not persona_prompt_template: + print(f"Error: Persona '{persona.get('name')}' is missing a prompt template.") + return [ClassificationOutput(assigned_tag="Error", reason="Persona prompt missing")] * len(project_data_batch) + + individual_project_prompts = [] + for i, project_data in enumerate(project_data_batch): + # Prepare metadata for formatting, handling None or NaN + # Ensure star_count and fork_count are numbers, default to 0 if None/NaN + star_count = project_data.get('star_count') + fork_count = project_data.get('fork_count') + + formatted_star_count = int(star_count) if pd.notna(star_count) else 0 + formatted_fork_count = int(fork_count) if pd.notna(fork_count) else 0 + + # Format dates, default to "N/A" if None/NaT + created_at = project_data.get('created_at') + updated_at = project_data.get('updated_at') + + formatted_created_at = str(created_at.date()) if pd.notna(created_at) and hasattr(created_at, 'date') else "N/A" + formatted_updated_at = str(updated_at.date()) if pd.notna(updated_at) and hasattr(updated_at, 'date') else "N/A" + + # Ensure summary is a string + summary_text = project_data.get('summary', "No summary provided.") + if not isinstance(summary_text, str): + summary_text = str(summary_text) + + + try: + # The persona_prompt_template itself contains the persona's role description. + # We just need to format it with the project-specific data. 
+ # The {categories} placeholder in the persona prompt will be filled by this categories_list_str. + formatted_project_section = persona_prompt_template.format( + summary=summary_text, + star_count=formatted_star_count, + fork_count=formatted_fork_count, + created_at=formatted_created_at, + updated_at=formatted_updated_at, + categories=categories_list_str # Pass the formatted list of categories + ) + individual_project_prompts.append(f"--- Project {i+1} ---\n{formatted_project_section}") + except KeyError as e: + print(f"KeyError during prompt formatting for persona {persona.get('name')}, project {project_data.get('repo_artifact_id', 'Unknown')}: {e}") + # Add a placeholder error entry for this project + individual_project_prompts.append(f"--- Project {i+1} ---\nError formatting prompt for this project. Cannot classify.") + + + batch_project_details_str = "\n\n".join(individual_project_prompts) + + # Construct the overall batch prompt + # The persona's title and description can frame the overall task. + persona_title = persona.get('title', persona['name']) + persona_description = persona.get('description', '') + + final_batch_prompt = f"""As {persona_title} ({persona_description}), your task is to review and classify the following {len(project_data_batch)} project(s). +For each project, use the specific instructions and context provided under its section. + +{batch_project_details_str} + +After reviewing all projects, please respond with a single JSON array. Each element in the array should be a JSON object corresponding to one project, in the exact order they were presented above. Each object must contain: +1. "assigned_tag": The category you assigned from the provided list. +2. "reason": A brief explanation for your choice, following the persona's specific instructions. + +Example for two projects: +[ + {{ "assigned_tag": "Category A", "reason": "Reason for project 1..." }}, + {{ "assigned_tag": "Category B", "reason": "Reason for project 2..." 
}} +] +""" + + batch_output = self.execute_query(final_batch_prompt, BatchClassificationOutput) + + # Ensure the number of classifications matches the number of projects + if len(batch_output.classifications) != len(project_data_batch): + print(f"Warning: Mismatch in number of projects ({len(project_data_batch)}) and classifications ({len(batch_output.classifications)}) for persona {persona['name']}.") + error_classification = ClassificationOutput(assigned_tag="Error", reason="Mismatch in batch processing output length") + # Adjust the length of classifications to match project_data_batch + final_classifications = batch_output.classifications[:len(project_data_batch)] + while len(final_classifications) < len(project_data_batch): + final_classifications.append(error_classification) + batch_output.classifications = final_classifications + + return batch_output.classifications + + +if __name__ == '__main__': + # Example Usage + # Example Usage: + # cfg_manager = ConfigManager() + # ai_service = AIService(config_manager=cfg_manager) + # print("AIService initialized for standalone testing if needed.") + pass diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py new file mode 100644 index 00000000..68b294c7 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py @@ -0,0 +1,135 @@ +import base64 +import requests +import pandas as pd +import datetime +from pyoso import Client +from ..config.settings import OSO_API_KEY, GITHUB_HEADERS + +class DataFetcher: + def __init__(self): + self.oso_client = Client(api_key=OSO_API_KEY) + + def fetch_repositories(self, limit: int = None, sort_by_stars: bool = True) -> pd.DataFrame: + """ + Fetch repositories from OSO. + + Args: + limit: Optional limit on number of repositories to fetch. + sort_by_stars: If True, sort repositories by star_count descending. + """ + query = """ + SELECT DISTINCT + re.artifact_id AS repo_artifact_id, + p.project_id, + p.project_name AS atlas_id, + p.display_name, + re.artifact_namespace AS repo_artifact_namespace, + re.artifact_name AS repo_artifact_name, + re.created_at, + re.updated_at, + re.star_count, + re.fork_count, + re.is_fork, + re.num_packages_in_deps_dev + FROM stg_op_atlas_application AS a + JOIN projects_v1 AS p + ON p.project_id = a.project_id + JOIN stg_op_atlas_project_repository AS pr + ON p.project_id = pr.project_id + JOIN int_repositories_enriched AS re + ON re.artifact_namespace = pr.artifact_namespace + AND re.artifact_name = pr.artifact_name + WHERE a.round_id = '7' + """ + # The table int_superchain_s7_devtooling_repositories should have star_count + # If not, this sort will fail or do nothing. Assuming 'r.star_count' is valid. 
+ if sort_by_stars: + query += " ORDER BY re.star_count DESC, p.project_name ASC" + + if limit is not None and isinstance(limit, int) and limit > 0: + query += f" LIMIT {limit}" + + df = self.oso_client.to_pandas(query) + + # Add is_actively_maintained field based on updated_at (active if updated in last year) + # Use naive datetime (no timezone) for comparison + one_year_ago = pd.Timestamp.now().tz_localize(None) - pd.Timedelta(days=365) + + # Convert updated_at to datetime if it's a string + def check_if_active(date): + if pd.isna(date): + return False + + # Convert to datetime if it's a string + if isinstance(date, str): + try: + date = pd.to_datetime(date) + except: + return False + + # Ensure datetime is naive (no timezone) for comparison + if hasattr(date, 'tz_localize'): + if date.tzinfo is not None: + date = date.tz_localize(None) + + # Now compare with one_year_ago + return date > one_year_ago + + df['is_actively_maintained'] = df['updated_at'].apply(check_if_active) + + # Ensure is_fork is a boolean + if 'is_fork' not in df.columns: + print("Warning: 'is_fork' field not available in OSO data. Setting all to False.") + df['is_fork'] = False + else: + # Convert to boolean if it's not already + df['is_fork'] = df['is_fork'].fillna(False).astype(bool) + + return df + + def fetch_readme(self, owner: str, repo: str) -> tuple: + """ + Fetch README.md content from GitHub repository with debug logging. + + Returns: + tuple: (readme_content, status) where status is one of: + "SUCCESS", "EMPTY", or "ERROR" + """ + url = f"https://api.github.com/repos/{owner}/{repo}/readme" + print(f"Fetching README for {owner}/{repo} ...", flush=True) + resp = requests.get(url, headers=GITHUB_HEADERS) + print(f"Status code: {resp.status_code}", flush=True) + if resp.status_code == 200: + data = resp.json() + try: + content = base64.b64decode(data["content"]).decode("utf-8") + if not content.strip(): + print(f"Empty README for {owner}/{repo}", flush=True) + return "", "EMPTY" + print(f"Successfully fetched README for {owner}/{repo}", flush=True) + return content, "SUCCESS" + except Exception as e: + print(f"Error decoding README for {owner}/{repo}: {e}", flush=True) + return "", "ERROR" + else: + print(f"Failed to fetch README for {owner}/{repo}: {resp.text}", flush=True) + return "", "ERROR" + + def get_all_readmes(self, df: pd.DataFrame) -> pd.DataFrame: + """Add README content to the dataframe for each repository with debug logging.""" + print("First 5 repo_artifact_namespace:", df["repo_artifact_namespace"].head().tolist(), flush=True) + print("First 5 repo_artifact_name:", df["repo_artifact_name"].head().tolist(), flush=True) + + # Apply fetch_readme and capture both content and status with progress bar + from tqdm import tqdm + tqdm.pandas(desc="Fetching READMEs") + readme_results = df.progress_apply( + lambda row: self.fetch_readme(row.repo_artifact_namespace, row.repo_artifact_name), + axis=1 + ) + + # Split the results into separate columns + df["readme_md"] = [result[0] for result in readme_results] + df["readme_status"] = [result[1] for result in readme_results] + + return df diff --git a/experiments/ethereum-repo-clusters/pipeline_config.json b/experiments/ethereum-repo-clusters/pipeline_config.json new file mode 100644 index 00000000..e116b26b --- /dev/null +++ b/experiments/ethereum-repo-clusters/pipeline_config.json @@ -0,0 +1,10 @@ +{ + "output_dir": "/Users/cerv1/Dropbox/Kariba/Github/insights/experiments/devtooling_labels/output", + "gemini_model": "gemini-2.0-flash", + 
"summary_prompt_template": "You are an analyst preparing short, neutral briefs on open-source projects. Read the README below and write a **concise, 2- to 3-sentence summary** that:\n\u2022 states the project\u2019s core purpose / problem it solves\n\u2022 lists its main capabilities or components (1\u20133 key points only)\n\u2022 mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n\u2022 notes any strongly signalled context such as supported programming language, network, or runtime\n\n**Style constraints**\n\u2022 Use plain, factual language in third person (no hype, no marketing adjectives).\n\u2022 **Do not** guess or invent details that are not explicit in the README.\n\u2022 **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n\u2022 Limit the summary to <100 words; avoid bullet lists or line breaks.\n\nReturn your answer as **exactly one valid JSON object** in this form (nothing extra):\n{{\n \"summary\": \"your summary here\"\n}}\n\nREADME:\n{readme_md}", + "tags_prompt_template": "Based on this project summary, generate a list of relevant tags that describe the project's purpose and functionality.\n\nYou must respond with a valid JSON object in this exact format:\n{{\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n}}\n\nSummary:\n{summary}", + "test_mode": true, + "test_mode_limit": 30, + "batch_size_summaries": 10, + "batch_size_categorization": 10 +} \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/requirements.txt b/experiments/ethereum-repo-clusters/requirements.txt new file mode 100644 index 00000000..7ab7b91e --- /dev/null +++ b/experiments/ethereum-repo-clusters/requirements.txt @@ -0,0 +1,8 @@ +pandas>=2.0.0 +requests>=2.31.0 +pyoso>=0.1.0 +google-generativeai>=0.3.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +click>=8.0.0 +pyarrow>=14.0.0 # For parquet support diff --git a/experiments/ethereum-repo-clusters/setup.py b/experiments/ethereum-repo-clusters/setup.py new file mode 100644 index 00000000..1b43d1c3 --- /dev/null +++ b/experiments/ethereum-repo-clusters/setup.py @@ -0,0 +1,16 @@ +from setuptools import setup, find_packages + +setup( + name="devtooling_labels", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "pandas>=2.0.0", + "requests>=2.31.0", + "pyoso>=0.1.0", + "google-generativeai>=0.3.0", + "pydantic>=2.0.0", + "python-dotenv>=1.0.0", + ], + python_requires=">=3.8", +) \ No newline at end of file From 7dc4853a21bc6cdedd47685021e5736b0d017f50 Mon Sep 17 00:00:00 2001 From: ccerv1 Date: Thu, 5 Jun 2025 13:37:09 -0400 Subject: [PATCH 3/4] fix: update category prompts --- .../config/prompts/categories.py | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py index 61bae994..dad8df1a 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py @@ -1,96 +1,102 @@ CATEGORIES = [ { - "category": "Language & Compilation Tools", + "category": "Lending & Borrowing Protocols", "description": ( - "Language & Compilation Tools include compilers, interpreters, language servers, " - "and syntax utilities for smart-contract development. 
They translate high-level " - "source code into EVM bytecode, perform static analysis, and enable features like " - "symbolic execution, forming the foundation for all higher-level tooling." + "Lending & Borrowing Protocols include implementations and SDKs for collateralized lending markets, " + "flash loans, interest rate models, and liquidation mechanisms. These tools handle asset management, " + "risk scoring, and pool accounting, enabling users to lend or borrow assets in a trust-minimized way." ) }, { - "category": "Core Protocol Interfaces", + "category": "Decentralized Exchanges (DEXs)", "description": ( - "Core Protocol Interfaces are libraries and SDKs that provide reusable building blocks " - "for blockchain developers—smart contract libraries, JSON-RPC clients, transaction builders, " - "wallet and key management, authorization, signature handling, and ABI encoding/decoding. " - "They can power the core operations of many dApps and services." + "DEXs power peer-to-peer asset swaps and liquidity provision. This includes AMM (automated market maker) " + "frameworks, order book DEXes, routers, aggregators, and liquidity management libraries. They also often " + "support advanced trading mechanisms like TWAPs, limit orders, and MEV protection." ) }, { - "category": "Development Frameworks", + "category": "Derivatives & Synthetic Assets", "description": ( - "Development Frameworks are opinionated, end-to-end toolchains that scaffold, build, " - "test, and deploy smart-contract projects. They bundle CLIs, IDE integrations, task " - "runners, local networks, hot-reloading, and plugin ecosystems to enforce conventions " - "and automate workflows from project setup through to frontend integration." + "Derivatives & Synthetic Assets frameworks implement perpetual futures, options, and collateralized synthetic " + "asset systems. These toolkits involve complex pricing oracles, risk engines, margin systems, and settlement layers." ) }, { - "category": "Deployment & Lifecycle Management", + "category": "Stablecoin Infrastructure", "description": ( - "Deployment & Lifecycle Management tools handle contract deployment, upgrades, and " - "on-chain migrations. They automate predictable CREATE2 strategies, proxy pattern " - "management, cross-network publishes, and governance hooks, while integrating safety " - "checks and test-suite validations to maintain contract integrity." + "Stablecoin Infrastructure includes minting contracts, collateralization engines, algorithmic stabilization mechanisms, " + "and off-chain attestation integrations. It also encompasses tools for analyzing backing ratios and peg health." ) }, { - "category": "Testing & Verification Tools", + "category": "Oracles & Price Feeds", "description": ( - "Testing & Verification Tools provide frameworks for unit testing, property-based fuzzing, " - "symbolic execution, formal verification, and coverage analysis. They integrate vulnerability " - "scanners, static analyzers, and coverage reporters to identify edge-case failures and ensure " - "on-chain correctness." + "Oracles & Price Feeds provide real-world and cross-chain data into smart contracts. This category covers push-based oracles, " + "pull-based on-demand queries, cryptoeconomic staking oracles, and off-chain data relayers." ) }, { - "category": "Developer Experience Tools", + "category": "Vaults, Yield Strategies & Aggregators", "description": ( - "Developer Experience Tools are lightweight plugins and utilities that boost productivity " - "and enforce code consistency. 
This category includes editor extensions, linters, formatters, " - "code generators, documentation generators, and small CLI helpers." + "These tools optimize capital across yield-bearing protocols. They include yield routers, auto-compounding vaults, and rebalancers, " + "as well as SDKs to model risk-return profiles and dynamically allocate capital across farms and lending markets." ) }, { - "category": "Infrastructure & Node Operations", + "category": "Asset Management & Portfolio Tooling", "description": ( - "Infrastructure & Node Operations encompass tools for running, coordinating, and scaling " - "blockchain nodes and peer-to-peer networks. They cover RPC providers, telemetry collectors, " - "log aggregators, gossip-based messaging layers, peer discovery and connection management, " - "and automation scripts to ensure reliable network participation." + "Asset Management tooling includes interfaces and libraries for building rebalancing strategies, vault-based funds, on-chain ETFs, " + "and automated index trackers. They often incorporate fee structures, role-based access, and compliance checks." ) }, { - "category": "Data Indexing & Analytics", + "category": "DeFi Security & Monitoring", "description": ( - "Data Indexing & Analytics tools ingest, process, and visualize on-chain data. They provide " - "GraphQL and REST APIs over processed datasets, real-time event streaming, and libraries or " - "dashboards for analyzing blockchain metrics." + "Security tools for DeFi include real-time exploit detectors, anomaly detection systems, pause mechanisms, multisig enforcers, " + "and post-mortem forensic tools. Monitoring dashboards and alerting frameworks fall here as well." ) }, { - "category": "Interoperability & Cross-chain", + "category": "Governance & DAO Tooling", "description": ( - "Interoperability & Cross-chain covers bridging frameworks, cross-chain messaging protocols, " - "and Superchain interoperability tooling. These libraries enable seamless asset transfers, " - "state proofs, and communication across multiple networks." + "Governance & DAO Tooling enables on-chain proposal management, token-weighted voting, off-chain signaling, execution queues, " + "and guardrails for DeFi governance systems. Includes snapshot integration, timelocks, and delegate management interfaces." ) }, { - "category": "Cryptography & Primitives", + "category": "Liquidity Bootstrapping & Token Distribution", "description": ( - "Cryptography & Primitives includes low-level cryptographic libraries and building blocks—" - "hash functions, signature schemes, Merkle trees, zero-knowledge proof primitives, and " - "encryption utilities—optimized for security and performance." + "This includes tools for liquidity mining, airdrops, vesting contracts, bonding curves, and initial token offerings. " + "They facilitate community-led distribution, price discovery, and progressive decentralization of DeFi protocols." ) }, { - "category": "Application-Specific & Niche Tools", + "category": "DeFi Analytics & Dashboards", "description": ( - "Application-Specific & Niche Tools are libraries and SDKs tailored to very narrow use cases " - "(e.g., DeFi adapters, NFT marketplaces, governance dashboards). They serve specific projects " - "but do not have broad applicability or reusability across the ecosystem." + "These are SDKs, APIs, and frontends for aggregating on-chain DeFi metrics—TVL, yield, volume, and user activity. 
" + "Includes data pipelines, Dune-compatible libraries, subgraphs, and event-based ETL infrastructure tailored to DeFi." + ) + }, + { + "category": "Cross-chain DeFi Infrastructure", + "description": ( + "These tools support multi-chain liquidity routing, cross-chain yield farming, state relays, and synthetic asset issuance. " + "They abstract away bridging mechanics, offering seamless user and liquidity migration across ecosystems." + ) + }, + { + "category": "User Interface & Integration SDKs", + "description": ( + "SDKs and frontend libraries for integrating DeFi functionality into wallets, dApps, and aggregators. Includes trade UIs, " + "Zap interfaces, gas estimators, and batch transaction helpers to improve DeFi UX." + ) + }, + { + "category": "Simulation & Risk Modeling", + "description": ( + "Tools that simulate user positions, economic incentives, or protocol upgrades. They model protocol resilience, agent behavior, " + "market shocks, and contagion scenarios, often using agent-based or Monte Carlo methods for risk-aware design." ) }, { From 2dbf38f9aaff7a55cdf39848ded92ddc53972884 Mon Sep 17 00:00:00 2001 From: ccerv1 Date: Fri, 6 Jun 2025 11:16:50 -0400 Subject: [PATCH 4/4] feat: add ef categories --- .../CategorySummary.ipynb | 531 ++++++++++++++++++ experiments/ethereum-repo-clusters/README.md | 71 +++ .../ethereum-repo-clusters/cli/main_cli.py | 3 +- .../config/prompts/categories.py | 97 ++++ .../config/prompts/personas.py | 87 ++- .../pipeline/data_manager.py | 99 ++++ .../pipeline/unified_processor.py | 295 +++++++--- .../processing/fetcher.py | 29 +- .../pipeline_config.json | 2 +- 9 files changed, 1108 insertions(+), 106 deletions(-) create mode 100644 experiments/ethereum-repo-clusters/CategorySummary.ipynb diff --git a/experiments/ethereum-repo-clusters/CategorySummary.ipynb b/experiments/ethereum-repo-clusters/CategorySummary.ipynb new file mode 100644 index 00000000..9ca79d6b --- /dev/null +++ b/experiments/ethereum-repo-clusters/CategorySummary.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9c0861ae-d89b-4f21-a743-f5a77efa7648", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "add99052-fecf-4130-9cc5-b7413c643864", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
repo_artifact_idproject_idproject_namedisplay_namerepo_artifact_namespacerepo_artifact_namecreated_atupdated_atstar_countfork_count...is_actively_maintainedfinal_recommendationprocessing_timestampsummaryreadme_statusprotocol_architectecosystem_analystsecurity_researcheruser_experience_advocategovernance_specialist
0jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM=KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU=ethereum-attestation-serviceEthereum Attestation Serviceethereum-attestation-serviceeas-docs-site2022-11-09 19:39:56.000 UTC2025-06-02 15:51:08.000 UTC1739...TrueDeveloper Experience Tools2025-06-06T00:55:35.737447The project provides documentation for the Eth...SUCCESSDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsApplication-Specific & Niche Tools
1Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechgmp-wasm2020-12-16 08:27:02.000 UTC2025-03-24 16:40:59.000 UTC174...TrueCryptography & Primitives2025-06-06T00:55:27.645719The GNU MP Library provides arbitrary precisio...SUCCESSCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & Primitives
2sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechdiagnostics2023-02-22 11:05:42.000 UTC2025-04-25 07:42:52.000 UTC1721...TrueInfrastructure & Node Operations2025-06-06T00:55:20.010674The Erigon Diagnostics System is a web applica...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsDeFi Security & MonitoringInfrastructure & Node OperationsInfrastructure & Node Operations
39C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainscourt2018-05-02 19:41:02.000 UTC2025-05-20 03:41:25.000 UTC177...TrueApplication-Specific & Niche Tools2025-06-06T00:55:11.604116Court provides smart contracts for arbitrating...SUCCESSApplication-Specific & Niche ToolsApplication-Specific & Niche ToolsDeFi Security & MonitoringApplication-Specific & Niche ToolsGovernance & DAO Tooling
4j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainsop-resolver2022-11-03 11:14:36.000 UTC2025-05-20 03:21:33.000 UTC176...TrueInteroperability & Cross-chain2025-06-06T00:55:03.917944The Optimism Resolver project facilitates stor...SUCCESSInfrastructure & Node OperationsInteroperability & Cross-chainInteroperability & Cross-chainUser Interface & Integration SDKsGovernance & DAO Tooling
..................................................................
5234AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuels-rs2021-10-31 22:33:54.000 UTC2025-06-03 17:34:29.000 UTC437471355...TrueDevelopment Frameworks2025-06-05T14:24:14.479181The fuels-rs project provides a Rust SDK for t...SUCCESSCore Protocol InterfacesDevelopment FrameworksDevelopment FrameworksDevelopment FrameworksDevelopment Frameworks
5235JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w=vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g=go-ethereumgethethereumgo-ethereum2013-12-26 13:05:46.000 UTC2025-06-03 16:54:54.000 UTC4906520888...TrueInfrastructure & Node Operations2025-06-05T14:24:08.096520Go Ethereum (geth) is a Golang implementation ...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5236imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuel-core2020-08-27 21:12:14.000 UTC2025-06-03 17:34:30.000 UTC576372852...TrueInfrastructure & Node Operations2025-06-05T14:24:01.176979The Fuel client implements a Fuel node, provid...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5237XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabssway2021-01-19 20:54:33.000 UTC2025-06-03 17:34:31.000 UTC622555405...TrueLanguage & Compilation Tools2025-06-05T14:23:54.181337Sway is a programming language designed for th...SUCCESSLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation Tools
5238ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA=cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o=base-orgBasebase-orgnode2023-02-01 13:55:02.000 UTC2025-02-10 01:22:12.000 UTC685682635...TrueInfrastructure & Node Operations2025-06-05T14:23:47.813647The Base Node project provides Docker configur...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
\n", + "

5239 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " repo_artifact_id \\\n", + "0 jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM= \n", + "1 Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w= \n", + "2 sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc= \n", + "3 9C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY= \n", + "4 j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM= \n", + "... ... \n", + "5234 AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4= \n", + "5235 JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w= \n", + "5236 imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA= \n", + "5237 XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4= \n", + "5238 ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA= \n", + "\n", + " project_id \\\n", + "0 KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU= \n", + "1 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "2 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "3 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "4 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "... ... \n", + "5234 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5235 vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g= \n", + "5236 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5237 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5238 cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o= \n", + "\n", + " project_name display_name \\\n", + "0 ethereum-attestation-service Ethereum Attestation Service \n", + "1 erigontech erigontech \n", + "2 erigontech erigontech \n", + "3 ensdomains ENS \n", + "4 ensdomains ENS \n", + "... ... ... \n", + "5234 fuellabs Fuel Network \n", + "5235 go-ethereum geth \n", + "5236 fuellabs Fuel Network \n", + "5237 fuellabs Fuel Network \n", + "5238 base-org Base \n", + "\n", + " repo_artifact_namespace repo_artifact_name \\\n", + "0 ethereum-attestation-service eas-docs-site \n", + "1 erigontech gmp-wasm \n", + "2 erigontech diagnostics \n", + "3 ensdomains court \n", + "4 ensdomains op-resolver \n", + "... ... ... \n", + "5234 fuellabs fuels-rs \n", + "5235 ethereum go-ethereum \n", + "5236 fuellabs fuel-core \n", + "5237 fuellabs sway \n", + "5238 base-org node \n", + "\n", + " created_at updated_at star_count \\\n", + "0 2022-11-09 19:39:56.000 UTC 2025-06-02 15:51:08.000 UTC 17 \n", + "1 2020-12-16 08:27:02.000 UTC 2025-03-24 16:40:59.000 UTC 17 \n", + "2 2023-02-22 11:05:42.000 UTC 2025-04-25 07:42:52.000 UTC 17 \n", + "3 2018-05-02 19:41:02.000 UTC 2025-05-20 03:41:25.000 UTC 17 \n", + "4 2022-11-03 11:14:36.000 UTC 2025-05-20 03:21:33.000 UTC 17 \n", + "... ... ... ... \n", + "5234 2021-10-31 22:33:54.000 UTC 2025-06-03 17:34:29.000 UTC 43747 \n", + "5235 2013-12-26 13:05:46.000 UTC 2025-06-03 16:54:54.000 UTC 49065 \n", + "5236 2020-08-27 21:12:14.000 UTC 2025-06-03 17:34:30.000 UTC 57637 \n", + "5237 2021-01-19 20:54:33.000 UTC 2025-06-03 17:34:31.000 UTC 62255 \n", + "5238 2023-02-01 13:55:02.000 UTC 2025-02-10 01:22:12.000 UTC 68568 \n", + "\n", + " fork_count ... is_actively_maintained \\\n", + "0 39 ... True \n", + "1 4 ... True \n", + "2 21 ... True \n", + "3 7 ... True \n", + "4 6 ... True \n", + "... ... ... ... \n", + "5234 1355 ... True \n", + "5235 20888 ... True \n", + "5236 2852 ... True \n", + "5237 5405 ... True \n", + "5238 2635 ... 
True \n", + "\n", + " final_recommendation processing_timestamp \\\n", + "0 Developer Experience Tools 2025-06-06T00:55:35.737447 \n", + "1 Cryptography & Primitives 2025-06-06T00:55:27.645719 \n", + "2 Infrastructure & Node Operations 2025-06-06T00:55:20.010674 \n", + "3 Application-Specific & Niche Tools 2025-06-06T00:55:11.604116 \n", + "4 Interoperability & Cross-chain 2025-06-06T00:55:03.917944 \n", + "... ... ... \n", + "5234 Development Frameworks 2025-06-05T14:24:14.479181 \n", + "5235 Infrastructure & Node Operations 2025-06-05T14:24:08.096520 \n", + "5236 Infrastructure & Node Operations 2025-06-05T14:24:01.176979 \n", + "5237 Language & Compilation Tools 2025-06-05T14:23:54.181337 \n", + "5238 Infrastructure & Node Operations 2025-06-05T14:23:47.813647 \n", + "\n", + " summary readme_status \\\n", + "0 The project provides documentation for the Eth... SUCCESS \n", + "1 The GNU MP Library provides arbitrary precisio... SUCCESS \n", + "2 The Erigon Diagnostics System is a web applica... SUCCESS \n", + "3 Court provides smart contracts for arbitrating... SUCCESS \n", + "4 The Optimism Resolver project facilitates stor... SUCCESS \n", + "... ... ... \n", + "5234 The fuels-rs project provides a Rust SDK for t... SUCCESS \n", + "5235 Go Ethereum (geth) is a Golang implementation ... SUCCESS \n", + "5236 The Fuel client implements a Fuel node, provid... SUCCESS \n", + "5237 Sway is a programming language designed for th... SUCCESS \n", + "5238 The Base Node project provides Docker configur... SUCCESS \n", + "\n", + " protocol_architect ecosystem_analyst \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "3 Application-Specific & Niche Tools Application-Specific & Niche Tools \n", + "4 Infrastructure & Node Operations Interoperability & Cross-chain \n", + "... ... ... \n", + "5234 Core Protocol Interfaces Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " security_researcher user_experience_advocate \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 DeFi Security & Monitoring Infrastructure & Node Operations \n", + "3 DeFi Security & Monitoring Application-Specific & Niche Tools \n", + "4 Interoperability & Cross-chain User Interface & Integration SDKs \n", + "... ... ... \n", + "5234 Development Frameworks Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " governance_specialist \n", + "0 Application-Specific & Niche Tools \n", + "1 Cryptography & Primitives \n", + "2 Infrastructure & Node Operations \n", + "3 Governance & DAO Tooling \n", + "4 Governance & DAO Tooling \n", + "... ... 
\n", + "5234 Development Frameworks \n", + "5235 Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations \n", + "\n", + "[5239 rows x 22 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet('output/ethereum_repos_unified.parquet')\n", + "df[\"categorizations_list\"] = df[\"categorizations_json\"].apply(json.loads)\n", + "\n", + "def persona_to_category_map(cats_list):\n", + " return { d[\"persona_name\"]: d[\"category\"] for d in cats_list }\n", + "df_persona_map = df[\"categorizations_list\"].apply(persona_to_category_map)\n", + "df_persona_cols = pd.json_normalize(df_persona_map)\n", + "\n", + "df = df.join(df_persona_cols)\n", + "df = df.drop(columns=[\"categorizations_list\", \"categorizations_json\", \"readme_md\"])\n", + "#df.to_csv('categorizations.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "368d46ba-cedc-455d-a246-925b6c996090", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/ethereum-repo-clusters/README.md b/experiments/ethereum-repo-clusters/README.md index 3a60aa1b..5f5db141 100644 --- a/experiments/ethereum-repo-clusters/README.md +++ b/experiments/ethereum-repo-clusters/README.md @@ -29,6 +29,9 @@ The entire process is managed via a Command Line Interface (CLI). - Outputs data at various stages in Parquet and CSV formats (with README text removed from CSV for readability). - Supports easy resumption of processing and addition of new repositories. - Features comprehensive progress bars at multiple levels for better visibility into processing status. +- **Checkpoint System**: Automatically saves progress after each step, allowing for seamless recovery from interruptions. +- **Incremental Saving**: Saves results after processing each repository, ensuring no work is lost if the process is interrupted. +- **Resume Capability**: Automatically detects partially processed repositories and continues from where it left off. ## Prerequisites @@ -206,6 +209,9 @@ The unified processor offers several advantages: - Timestamps for all operations for better traceability - Detailed progress bars for tracking processing status at multiple levels - CSV output with README text removed for improved readability +- Checkpoint system that saves progress after each step +- Incremental saving that preserves work even if interrupted +- Automatic resume capability that continues from where it left off ## Output Files @@ -223,6 +229,7 @@ All output data is stored in the directory specified by `output_dir` in `pipelin - **`ethereum_repos_unified.parquet`**: Comprehensive dataset containing all repositories with their metadata, summaries, and categorizations in a single structure. - **`ethereum_repos_unified.csv`**: A CSV version of the unified data for easier viewing, with README text removed and long text fields truncated for readability. 
+- **`processing_checkpoint.json`**: Checkpoint file that tracks processing progress, allowing for seamless recovery from interruptions. ### Unified Data Structure @@ -329,3 +336,67 @@ The unified processor handles errors gracefully: - API errors during categorization: The specific persona's categorization is marked as "UNCATEGORIZED" with the error reason. This approach ensures that all repositories are included in the final output, even if they couldn't be fully processed. + +## Checkpoint System + +The unified processor now includes a robust checkpoint system that makes it resilient to interruptions: + +### How It Works + +1. **Incremental Saving**: Results are saved after processing each repository, not just at the end. +2. **Checkpoint File**: A JSON file (`output/processing_checkpoint.json`) tracks: + - Which repositories have been fully processed + - Which repositories are partially processed and their current state + - The last repository that was successfully processed + +3. **Granular Progress Tracking**: The checkpoint tracks progress at multiple levels: + - README fetching status + - Summary generation status + - Which personas have completed categorization + +4. **Resume Logic**: When restarted after an interruption, the processor: + - Skips repositories that have been fully processed + - Continues from where it left off for partially processed repositories + - Preserves all work that was completed before the interruption + +5. **Space Optimization**: Once a repository is fully processed, its partial results are removed from the checkpoint file to save space. + +### Benefits + +- **No Lost Work**: Even if interrupted during a long-running process, no work is lost. +- **API Efficiency**: Avoids redundant API calls to GitHub and Gemini, saving rate limits and costs. +- **Time Savings**: Picks up exactly where it left off, avoiding redundant processing. +- **Resilience**: Handles network issues, API timeouts, and other temporary failures gracefully. + +### Example Checkpoint Structure + +```json +{ + "last_processed_repo_id": "ethereum/solidity", + "processed_repos": ["openzeppelin/openzeppelin-contracts", "ethereum/solidity"], + "partial_results": { + "ipfs/kubo": { + "readme_fetched": true, + "readme_status": "SUCCESS", + "summary_generated": true, + "personas_completed": ["protocol_architect", "ecosystem_analyst"], + "categorizations": [ + { + "persona_name": "protocol_architect", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:30.903574" + }, + { + "persona_name": "ecosystem_analyst", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:32.238039" + } + ] + } + } +} +``` + +This checkpoint system ensures that the processing pipeline is robust and can handle interruptions gracefully, making it suitable for processing large numbers of repositories over extended periods. 
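+
+### Inspecting the Checkpoint
+
+The snippet below is a minimal sketch for debugging only, assuming the default `output/processing_checkpoint.json` location and the keys shown in the example above (adjust the path if you changed `output_dir` in `pipeline_config.json`). It is not part of the pipeline itself; the processor reads and updates the checkpoint automatically.
+
+```python
+import json
+from pathlib import Path
+
+checkpoint_path = Path("output/processing_checkpoint.json")  # assumed default location
+
+if checkpoint_path.exists():
+    checkpoint = json.loads(checkpoint_path.read_text())
+
+    processed = checkpoint.get("processed_repos", [])
+    partial = checkpoint.get("partial_results", {})
+
+    print(f"Fully processed repositories: {len(processed)}")
+    print(f"Partially processed repositories: {len(partial)}")
+    print(f"Last processed repo: {checkpoint.get('last_processed_repo_id')}")
+
+    # For each partially processed repository, show what remains to be done
+    for repo_id, state in partial.items():
+        personas_done = state.get("personas_completed", [])
+        print(
+            f"{repo_id}: readme={state.get('readme_status')}, "
+            f"summary={'yes' if state.get('summary_generated') else 'no'}, "
+            f"personas completed={len(personas_done)}"
+        )
+else:
+    print("No checkpoint found; the next run will start from scratch.")
+```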
diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py index 9661a7cb..6a1a8ddd 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py @@ -160,6 +160,7 @@ def process_unified_command(ctx, force_refresh, include_forks, include_inactive, @click.option('--use-unified', is_flag=True, help='Use the new unified processor instead of the legacy pipeline.') @click.option('--include-forks', is_flag=True, help='Include forked repositories (only with --use-unified).') @click.option('--include-inactive', is_flag=True, help='Include inactive repositories (only with --use-unified).') +@click.option('--limit', type=int, help='Limit the number of repositories to process (only with --use-unified).') @click.pass_context def run_all_command(ctx, force_refresh_all, force_refresh_repos, force_refresh_summaries, force_refresh_categories, use_unified, include_forks, include_inactive): @@ -172,7 +173,7 @@ def run_all_command(ctx, force_refresh_all, force_refresh_repos, force_refresh_s force_refresh=force_refresh_all, include_forks=include_forks, include_inactive=include_inactive, - limit=None + limit=limit ) else: print("Executing: Run All Pipeline Steps (Legacy)") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py index dad8df1a..bd201d31 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py @@ -1,4 +1,5 @@ CATEGORIES = [ + # DeFi Application Categories { "category": "Lending & Borrowing Protocols", "description": ( @@ -99,6 +100,102 @@ "market shocks, and contagion scenarios, often using agent-based or Monte Carlo methods for risk-aware design." ) }, + + # Developer Tool Categories + { + "category": "Language & Compilation Tools", + "description": ( + "Language & Compilation Tools include compilers, interpreters, language servers, " + "and syntax utilities for smart-contract development. They translate high-level " + "source code into EVM bytecode, perform static analysis, and enable features like " + "symbolic execution, forming the foundation for all higher-level tooling." + ) + }, + { + "category": "Core Protocol Interfaces", + "description": ( + "Core Protocol Interfaces are libraries and SDKs that provide reusable building blocks " + "for blockchain developers—smart contract libraries, JSON-RPC clients, transaction builders, " + "wallet and key management, authorization, signature handling, and ABI encoding/decoding. " + "They can power the core operations of many dApps and services." + ) + }, + { + "category": "Development Frameworks", + "description": ( + "Development Frameworks are opinionated, end-to-end toolchains that scaffold, build, " + "test, and deploy smart-contract projects. They bundle CLIs, IDE integrations, task " + "runners, local networks, hot-reloading, and plugin ecosystems to enforce conventions " + "and automate workflows from project setup through to frontend integration." + ) + }, + { + "category": "Deployment & Lifecycle Management", + "description": ( + "Deployment & Lifecycle Management tools handle contract deployment, upgrades, and " + "on-chain migrations. 
They automate predictable CREATE2 strategies, proxy pattern " + "management, cross-network publishes, and governance hooks, while integrating safety " + "checks and test-suite validations to maintain contract integrity." + ) + }, + { + "category": "Testing & Verification Tools", + "description": ( + "Testing & Verification Tools provide frameworks for unit testing, property-based fuzzing, " + "symbolic execution, formal verification, and coverage analysis. They integrate vulnerability " + "scanners, static analyzers, and coverage reporters to identify edge-case failures and ensure " + "on-chain correctness." + ) + }, + { + "category": "Developer Experience Tools", + "description": ( + "Developer Experience Tools are lightweight plugins and utilities that boost productivity " + "and enforce code consistency. This category includes editor extensions, linters, formatters, " + "code generators, documentation generators, and small CLI helpers." + ) + }, + { + "category": "Infrastructure & Node Operations", + "description": ( + "Infrastructure & Node Operations encompass tools for running, coordinating, and scaling " + "blockchain nodes and peer-to-peer networks. They cover RPC providers, telemetry collectors, " + "log aggregators, gossip-based messaging layers, peer discovery and connection management, " + "and automation scripts to ensure reliable network participation." + ) + }, + { + "category": "Data Indexing & Analytics", + "description": ( + "Data Indexing & Analytics tools ingest, process, and visualize on-chain data. They provide " + "GraphQL and REST APIs over processed datasets, real-time event streaming, and libraries or " + "dashboards for analyzing blockchain metrics." + ) + }, + { + "category": "Interoperability & Cross-chain", + "description": ( + "Interoperability & Cross-chain covers bridging frameworks, cross-chain messaging protocols, " + "and Superchain interoperability tooling. These libraries enable seamless asset transfers, " + "state proofs, and communication across multiple networks." + ) + }, + { + "category": "Cryptography & Primitives", + "description": ( + "Cryptography & Primitives includes low-level cryptographic libraries and building blocks—" + "hash functions, signature schemes, Merkle trees, zero-knowledge proof primitives, and " + "encryption utilities—optimized for security and performance." + ) + }, + { + "category": "Application-Specific & Niche Tools", + "description": ( + "Application-Specific & Niche Tools are libraries and SDKs tailored to very narrow use cases " + "(e.g., DeFi adapters, NFT marketplaces, governance dashboards). They serve specific projects " + "but do not have broad applicability or reusability across the ecosystem." + ) + }, { "category": "Others", "description": ( diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py index 1ad6ab82..209515cf 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py @@ -1,64 +1,109 @@ PERSONAS = [ { - "name": "keyword_spotter", - "title": "Keyword Spotter", + "name": "protocol_architect", + "title": "Protocol & Infrastructure Architect", "description": ( - "You focus on explicit keywords in summaries and metadata to quickly map " - "projects to the most likely category." 
+ "You evaluate projects based on their technical architecture, infrastructure role, " + "and protocol design patterns. You focus on how well the project implements DeFi primitives, " + "contributes to ecosystem stability, and maintains technical dependencies." ), "prompt": ( - "As a Keyword Spotter, scan the project summary and metadata for tell-tale terms.\n\n" + "As a Protocol & Infrastructure Architect, analyze the project's technical foundations, " + "infrastructure role, and protocol design.\n\n" "Summary: {summary}\n" "Stars: {star_count} | Forks: {fork_count}\n" "Created: {created_at} | Updated: {updated_at}\n\n" - "Based on these details, choose one of the categories below:\n" + "Based on the technical architecture, infrastructure contribution, and protocol design, " + "choose one of the categories below:\n" "{categories}\n\n" "Respond in JSON:\n" "{{\n" ' "assigned_tag": "category name",\n' - ' "reason": "which keywords influenced your decision"\n' + ' "reason": "analysis of protocol architecture, infrastructure role, technical dependencies, and ecosystem stability"\n' "}}" ), }, { - "name": "senior_strategist", - "title": "Senior Strategist", + "name": "ecosystem_analyst", + "title": "Ecosystem Growth Analyst", "description": ( - "You take a broad, long-term view—considering maturity, community traction, " - "and ecosystem fit—to carefully assign the most appropriate category." + "You assess projects based on their potential to grow the Ethereum DeFi ecosystem, " + "their user adoption metrics, and their contribution to composability and innovation." ), "prompt": ( - "As a Senior Strategist, evaluate the project’s maturity, adoption, and fit.\n\n" + "As an Ecosystem Growth Analyst, evaluate the project's impact on DeFi ecosystem growth.\n\n" "Summary: {summary}\n" "Stars: {star_count} | Forks: {fork_count}\n" "Created: {created_at} | Updated: {updated_at}\n\n" - "Select one of the categories below:\n" + "Select the category that best represents its ecosystem role:\n" "{categories}\n\n" "Respond in JSON:\n" "{{\n" ' "assigned_tag": "category name",\n' - ' "reason": "holistic rationale covering maturity, adoption, and ecosystem utility"\n' + ' "reason": "analysis of ecosystem impact, adoption potential, and composability"\n' "}}" ), }, { - "name": "workflow_wizard", - "title": "Workflow Wizard", + "name": "security_researcher", + "title": "Security & Risk Researcher", "description": ( - "You imagine the ideal developer journey—setup, day-to-day ergonomics, " - "and integration—and assign the category that feels most intuitive." + "You focus on security practices, risk management approaches, and the project's " + "contribution to making DeFi safer and more resilient." 
), "prompt": ( - "As a Workflow Wizard, envision how a developer would onboard and use this tool.\n\n" + "As a Security & Risk Researcher, assess the project's security posture and risk management.\n\n" "Summary: {summary}\n" "Stars: {star_count} | Forks: {fork_count}\n" "Created: {created_at} | Updated: {updated_at}\n\n" - "Choose the category that best supports a seamless workflow:\n" + "Choose the category that best reflects its security and risk management approach:\n" "{categories}\n\n" "Respond in JSON:\n" "{{\n" ' "assigned_tag": "category name",\n' - ' "reason": "analysis based on developer ergonomics and workflow"\n' + ' "reason": "analysis of security practices, risk management, and safety features"\n' + "}}" + ), + }, + { + "name": "user_experience_advocate", + "title": "User Experience Advocate", + "description": ( + "You evaluate projects based on their user experience, accessibility, and potential " + "to onboard new users to DeFi. You focus on usability and integration capabilities." + ), + "prompt": ( + "As a User Experience Advocate, assess the project's usability and accessibility.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select the category that best represents its user experience focus:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of user experience, accessibility, and onboarding potential"\n' + "}}" + ), + }, + { + "name": "governance_specialist", + "title": "Governance & Decentralization Specialist", + "description": ( + "You analyze projects based on their governance mechanisms, decentralization approach, " + "and contribution to sustainable protocol management." + ), + "prompt": ( + "As a Governance & Decentralization Specialist, evaluate the project's governance model.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best reflects its governance and decentralization approach:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of governance mechanisms, decentralization, and sustainability"\n' "}}" ), } diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py index 4c8eef66..035e1caf 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py @@ -282,6 +282,62 @@ def get_unified_data(self) -> pd.DataFrame: return data + def append_unified_data(self, new_repo_data: pd.DataFrame) -> None: + """ + Append a single repository or multiple repositories to the existing unified data. 
+ + Args: + new_repo_data: DataFrame containing the new repository data to append + """ + if new_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the new data + self.save_unified_data(new_repo_data) + return + + # Combine existing and new data + combined_data = pd.concat([existing_data, new_repo_data], ignore_index=True) + + # Remove duplicates based on repo_artifact_id, keeping the newest version + combined_data = combined_data.sort_values('processing_timestamp', ascending=False) + combined_data = combined_data.drop_duplicates(subset=['repo_artifact_id'], keep='first') + + # Save the combined data + self.save_unified_data(combined_data) + + def update_unified_data(self, updated_repo_data: pd.DataFrame) -> None: + """ + Update specific repositories in the existing unified data. + + Args: + updated_repo_data: DataFrame containing the updated repository data + """ + if updated_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the updated data + self.save_unified_data(updated_repo_data) + return + + # Get the repo_artifact_ids of the updated repositories + updated_ids = set(updated_repo_data['repo_artifact_id']) + + # Remove the repositories that are being updated from the existing data + filtered_existing = existing_data[~existing_data['repo_artifact_id'].isin(updated_ids)] + + # Combine the filtered existing data with the updated data + combined_data = pd.concat([filtered_existing, updated_repo_data], ignore_index=True) + + # Save the combined data + self.save_unified_data(combined_data) + def wipe_unified_data(self): """Wipe unified data files""" if self.unified_parquet_path.exists(): @@ -290,3 +346,46 @@ def wipe_unified_data(self): if self.unified_csv_path.exists(): self.unified_csv_path.unlink() print(f"Wiped unified CSV data: {self.unified_csv_path}") + + def get_checkpoint_path(self) -> Path: + """Get the path to the processing checkpoint file""" + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + return local_output_dir / "processing_checkpoint.json" + + def save_checkpoint(self, checkpoint_data: Dict[str, Any]) -> None: + """ + Save the processing checkpoint data to a JSON file. + + Args: + checkpoint_data: Dictionary containing checkpoint information + """ + checkpoint_path = self.get_checkpoint_path() + with open(checkpoint_path, 'w') as f: + json.dump(checkpoint_data, f, indent=2) + + def load_checkpoint(self) -> Dict[str, Any]: + """ + Load the processing checkpoint data from a JSON file. 
+ + Returns: + Dictionary containing checkpoint information, or empty dict if no checkpoint exists + """ + checkpoint_path = self.get_checkpoint_path() + if not checkpoint_path.exists(): + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + + try: + with open(checkpoint_path, 'r') as f: + return json.load(f) + except Exception as e: + print(f"Error loading checkpoint: {e}") + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py index c5f310c7..d669fa76 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py @@ -1,7 +1,8 @@ import pandas as pd import datetime import json -from typing import List, Dict, Any, Optional +import time +from typing import List, Dict, Any, Optional, Set from tqdm import tqdm from .data_manager import DataManager from ..config.config_manager import ConfigManager @@ -44,13 +45,14 @@ def run(self, self.config_manager.get_batch_size_categorization() ) - # Get existing data if not forcing refresh + # Load checkpoint or initialize a new one if force_refresh: - print("Force refresh enabled. Wiping existing data.") - self.data_manager.wipe_repos_data() + print("Force refresh enabled. Wiping existing data and checkpoint.") + self.data_manager.wipe_unified_data() + self._initialize_checkpoint() existing_df = pd.DataFrame() else: - existing_df = self.data_manager.get_repos_data() + existing_df = self.data_manager.get_unified_data() if not existing_df.empty: print(f"Found existing data with {len(existing_df)} repositories.") @@ -73,34 +75,36 @@ def run(self, repos_df = repos_df[repos_df['is_actively_maintained']] print(f"Filtered out inactive repositories. 
{len(repos_df)} repositories remaining.") + # Load checkpoint to determine which repositories need processing + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + # Determine which repositories need processing - if not existing_df.empty and not force_refresh: - # Identify repositories that have already been fully processed - processed_repos = set() - if 'categorizations' in existing_df.columns: - processed_repos = set( - existing_df[existing_df['categorizations'].apply(lambda x: isinstance(x, list) and len(x) > 0)]['repo_artifact_id'] - ) - + if not force_refresh: # Filter out already processed repositories repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(processed_repos)] print(f"Found {len(repos_to_process)} repositories that need processing.") - # Combine with existing data for final output - combined_df = pd.concat([ - existing_df[existing_df['repo_artifact_id'].isin(processed_repos)], - self._process_repositories(repos_to_process, batch_size) - ], ignore_index=True) + # Process the repositories + processed_df = self._process_repositories(repos_to_process, batch_size) - # Save the combined data - self.data_manager.save_unified_data(combined_df) - return combined_df + # Return the combined data (existing + newly processed) + return self.data_manager.get_unified_data() else: # Process all repositories processed_df = self._process_repositories(repos_df, batch_size) - self.data_manager.save_unified_data(processed_df) - return processed_df + return self.data_manager.get_unified_data() + def _initialize_checkpoint(self): + """Initialize a new checkpoint file""" + checkpoint = { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + self.data_manager.save_checkpoint(checkpoint) + print("Initialized new processing checkpoint.") + def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.DataFrame: """ Process repositories in batches: fetch READMEs, generate summaries, and categorize. 
@@ -120,6 +124,11 @@ def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.D print("No personas found for categorization.") return repos_df + # Load checkpoint + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + partial_results = checkpoint.get("partial_results", {}) + # Process in batches all_processed_data = [] @@ -127,41 +136,114 @@ def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.D end_idx = min(start_idx + batch_size, len(repos_df)) batch_df = repos_df.iloc[start_idx:end_idx].copy() - # Fetch READMEs for this batch - batch_df = self.fetcher.get_all_readmes(batch_df) - - # Initialize the categorizations column with empty lists - batch_df['categorizations'] = [[] for _ in range(len(batch_df))] - batch_df['final_recommendation'] = 'UNCATEGORIZED' - batch_df['processing_timestamp'] = datetime.datetime.now().isoformat() - batch_df['summary'] = '' - # Process each repository in the batch for idx, row in tqdm(batch_df.iterrows(), desc="Processing repositories in batch", total=len(batch_df), leave=False): - # Initialize categorizations list - categorizations = [] + repo_id = row.get('repo_artifact_id') + repo_name = row.get('repo_artifact_name', 'repo') - # Get README status - readme_status = row.get('readme_status', 'ERROR') + # Skip if already fully processed + if repo_id in processed_repos: + print(f"Skipping {repo_name} (already processed)") + continue - # Generate summary if README is available - summary = "" - if readme_status == "SUCCESS": - readme_content = row.get('readme_md', "") - summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) - summary = summary_output.summary - - # Categorize with each persona - for persona in tqdm(personas, desc=f"Categorizing {row.get('repo_artifact_name', 'repo')} with personas", leave=False): + # Get partial progress for this repository + partial = partial_results.get(repo_id, {}) + + # Initialize repository data + repo_data = row.to_dict() + repo_data['categorizations'] = [] + repo_data['final_recommendation'] = 'UNCATEGORIZED' + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + repo_data['summary'] = '' + + # Fetch README if needed + if not partial.get('readme_fetched', False): + try: + print(f"Fetching README for {repo_name}...") + readme_content, readme_status = self.fetcher.fetch_readme( + repo_data['repo_artifact_namespace'], + repo_data['repo_artifact_name'] + ) + repo_data['readme_md'] = readme_content + repo_data['readme_status'] = readme_status + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = repo_data['readme_status'] + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error fetching README for {repo_name}: {e}") + repo_data['readme_md'] = '' + repo_data['readme_status'] = 'ERROR' + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = 'ERROR' + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + else: + # Use README status from checkpoint + repo_data['readme_status'] = partial.get('readme_status', 'ERROR') + + # Generate summary if needed + if not partial.get('summary_generated', False) and repo_data['readme_status'] == 'SUCCESS': + try: + print(f"Generating summary for {repo_name}...") + readme_content = 
repo_data.get('readme_md', '') + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + repo_data['summary'] = summary_output.summary + + # Update checkpoint + partial['summary_generated'] = True + partial['summary'] = summary_output.summary + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error generating summary for {repo_name}: {e}") + repo_data['summary'] = '' + + # Update checkpoint + partial['summary_generated'] = True # Mark as attempted + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + elif partial.get('summary_generated', False) and 'summary' in partial: + # Use summary from checkpoint + repo_data['summary'] = partial.get('summary', '') + + # Initialize personas completed + if 'personas_completed' not in partial: + partial['personas_completed'] = [] + + # Initialize categorizations + categorizations = [] + + # Categorize with each persona if README is available + if repo_data['readme_status'] == 'SUCCESS': + for persona in tqdm(personas, desc=f"Categorizing {repo_name} with personas", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + try: # Prepare project data for categorization project_data = { - 'summary': summary, - 'repo_artifact_id': row.get('repo_artifact_id', 'UNKNOWN_ID'), - 'star_count': row.get('star_count', 0), - 'fork_count': row.get('fork_count', 0), - 'created_at': row.get('created_at'), - 'updated_at': row.get('updated_at') + 'summary': repo_data['summary'], + 'repo_artifact_id': repo_id, + 'star_count': repo_data.get('star_count', 0), + 'fork_count': repo_data.get('fork_count', 0), + 'created_at': repo_data.get('created_at'), + 'updated_at': repo_data.get('updated_at') } # Get categorization from this persona @@ -172,53 +254,124 @@ def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.D if classifications and len(classifications) > 0: classification = classifications[0] - categorizations.append({ + cat_entry = { 'persona_name': persona['name'], 'category': classification.assigned_tag, 'reason': classification.reason, 'timestamp': datetime.datetime.now().isoformat() - }) + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) else: - categorizations.append({ + cat_entry = { 'persona_name': persona['name'], 'category': 'UNCATEGORIZED', 'reason': 'Failed to get classification from AI service', 'timestamp': datetime.datetime.now().isoformat() - }) + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + 
self.data_manager.save_checkpoint(checkpoint) except Exception as e: - print(f"Error categorizing with persona {persona['name']}: {e}") - categorizations.append({ + print(f"Error categorizing {repo_name} with persona {persona['name']}: {e}") + cat_entry = { 'persona_name': persona['name'], 'category': 'UNCATEGORIZED', 'reason': f'Error: {str(e)}', 'timestamp': datetime.datetime.now().isoformat() - }) + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add a small delay to avoid rate limiting + time.sleep(0.1) else: # If README is empty or error, mark all categorizations as UNCATEGORIZED - for persona in tqdm(personas, desc=f"Marking {row.get('repo_artifact_name', 'repo')} as UNCATEGORIZED", leave=False): - categorizations.append({ + for persona in tqdm(personas, desc=f"Marking {repo_name} as UNCATEGORIZED", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + + cat_entry = { 'persona_name': persona['name'], 'category': 'UNCATEGORIZED', - 'reason': f'README {readme_status}', + 'reason': f'README {repo_data["readme_status"]}', 'timestamp': datetime.datetime.now().isoformat() - }) + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) # Determine final recommendation based on categorizations - final_recommendation = self._determine_final_recommendation(categorizations, row.get('star_count', 0)) + final_recommendation = self._determine_final_recommendation(categorizations, repo_data.get('star_count', 0)) - # Update the row with processed data - batch_df.at[idx, 'summary'] = summary - batch_df.at[idx, 'categorizations'] = categorizations - batch_df.at[idx, 'final_recommendation'] = final_recommendation - batch_df.at[idx, 'processing_timestamp'] = datetime.datetime.now().isoformat() - - all_processed_data.append(batch_df) + # Update the repository data + repo_data['categorizations'] = categorizations + repo_data['final_recommendation'] = final_recommendation + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + + # Create a DataFrame for this repository + repo_df = pd.DataFrame([repo_data]) + + # Save this repository to the unified data + self.data_manager.append_unified_data(repo_df) + + # Mark as fully processed + processed_repos.add(repo_id) + checkpoint['processed_repos'] = list(processed_repos) + checkpoint['last_processed_repo_id'] = repo_id + + # Remove from partial results to save space + if repo_id in partial_results: + del partial_results[repo_id] + + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add to processed data + all_processed_data.append(repo_df) if not all_processed_data: 
print("No data was processed.") return pd.DataFrame() - return pd.concat(all_processed_data, ignore_index=True) + return pd.concat(all_processed_data, ignore_index=True) if all_processed_data else pd.DataFrame() def _determine_final_recommendation(self, categorizations: List[Dict[str, Any]], star_count: int) -> str: """ diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py index 68b294c7..36bf25ac 100644 --- a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py @@ -17,11 +17,19 @@ def fetch_repositories(self, limit: int = None, sort_by_stars: bool = True) -> p limit: Optional limit on number of repositories to fetch. sort_by_stars: If True, sort repositories by star_count descending. """ - query = """ + + where_keywords = """ + collection_name LIKE '%ethereum%' + OR collection_name LIKE '%arbitrum%' + OR collection_name LIKE '%optimism%' + OR collection_name LIKE '%scroll%' + OR collection_name LIKE '%polygon%' + """ + query = f""" SELECT DISTINCT re.artifact_id AS repo_artifact_id, p.project_id, - p.project_name AS atlas_id, + p.project_name, p.display_name, re.artifact_namespace AS repo_artifact_namespace, re.artifact_name AS repo_artifact_name, @@ -30,16 +38,13 @@ def fetch_repositories(self, limit: int = None, sort_by_stars: bool = True) -> p re.star_count, re.fork_count, re.is_fork, - re.num_packages_in_deps_dev - FROM stg_op_atlas_application AS a - JOIN projects_v1 AS p - ON p.project_id = a.project_id - JOIN stg_op_atlas_project_repository AS pr - ON p.project_id = pr.project_id - JOIN int_repositories_enriched AS re - ON re.artifact_namespace = pr.artifact_namespace - AND re.artifact_name = pr.artifact_name - WHERE a.round_id = '7' + re.num_packages_in_deps_dev + FROM int_repositories_enriched AS re + JOIN projects_v1 AS p ON re.project_id = p.project_id + WHERE p.project_id IN ( + SELECT DISTINCT project_id FROM oso.projects_by_collection_v1 + WHERE {where_keywords} + ) """ # The table int_superchain_s7_devtooling_repositories should have star_count # If not, this sort will fail or do nothing. Assuming 'r.star_count' is valid. diff --git a/experiments/ethereum-repo-clusters/pipeline_config.json b/experiments/ethereum-repo-clusters/pipeline_config.json index e116b26b..65dc68d3 100644 --- a/experiments/ethereum-repo-clusters/pipeline_config.json +++ b/experiments/ethereum-repo-clusters/pipeline_config.json @@ -3,7 +3,7 @@ "gemini_model": "gemini-2.0-flash", "summary_prompt_template": "You are an analyst preparing short, neutral briefs on open-source projects. 
Read the README below and write a **concise, 2- to 3-sentence summary** that:\n\u2022 states the project\u2019s core purpose / problem it solves\n\u2022 lists its main capabilities or components (1\u20133 key points only)\n\u2022 mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n\u2022 notes any strongly signalled context such as supported programming language, network, or runtime\n\n**Style constraints**\n\u2022 Use plain, factual language in third person (no hype, no marketing adjectives).\n\u2022 **Do not** guess or invent details that are not explicit in the README.\n\u2022 **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n\u2022 Limit the summary to <100 words; avoid bullet lists or line breaks.\n\nReturn your answer as **exactly one valid JSON object** in this form (nothing extra):\n{{\n \"summary\": \"your summary here\"\n}}\n\nREADME:\n{readme_md}", "tags_prompt_template": "Based on this project summary, generate a list of relevant tags that describe the project's purpose and functionality.\n\nYou must respond with a valid JSON object in this exact format:\n{{\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n}}\n\nSummary:\n{summary}", - "test_mode": true, + "test_mode": false, "test_mode_limit": 30, "batch_size_summaries": 10, "batch_size_categorization": 10