diff --git a/experiments/ethereum-repo-clusters/.gitignore b/experiments/ethereum-repo-clusters/.gitignore new file mode 100644 index 00000000..58ae5f3c --- /dev/null +++ b/experiments/ethereum-repo-clusters/.gitignore @@ -0,0 +1,38 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environment +.env +.venv +env/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Project specific +data/ +output/ \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/CategorySummary.ipynb b/experiments/ethereum-repo-clusters/CategorySummary.ipynb new file mode 100644 index 00000000..9ca79d6b --- /dev/null +++ b/experiments/ethereum-repo-clusters/CategorySummary.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9c0861ae-d89b-4f21-a743-f5a77efa7648", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "add99052-fecf-4130-9cc5-b7413c643864", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
repo_artifact_idproject_idproject_namedisplay_namerepo_artifact_namespacerepo_artifact_namecreated_atupdated_atstar_countfork_count...is_actively_maintainedfinal_recommendationprocessing_timestampsummaryreadme_statusprotocol_architectecosystem_analystsecurity_researcheruser_experience_advocategovernance_specialist
0jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM=KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU=ethereum-attestation-serviceEthereum Attestation Serviceethereum-attestation-serviceeas-docs-site2022-11-09 19:39:56.000 UTC2025-06-02 15:51:08.000 UTC1739...TrueDeveloper Experience Tools2025-06-06T00:55:35.737447The project provides documentation for the Eth...SUCCESSDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsDeveloper Experience ToolsApplication-Specific & Niche Tools
1Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechgmp-wasm2020-12-16 08:27:02.000 UTC2025-03-24 16:40:59.000 UTC174...TrueCryptography & Primitives2025-06-06T00:55:27.645719The GNU MP Library provides arbitrary precisio...SUCCESSCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & PrimitivesCryptography & Primitives
2sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc=4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20=erigontecherigontecherigontechdiagnostics2023-02-22 11:05:42.000 UTC2025-04-25 07:42:52.000 UTC1721...TrueInfrastructure & Node Operations2025-06-06T00:55:20.010674The Erigon Diagnostics System is a web applica...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsDeFi Security & MonitoringInfrastructure & Node OperationsInfrastructure & Node Operations
39C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainscourt2018-05-02 19:41:02.000 UTC2025-05-20 03:41:25.000 UTC177...TrueApplication-Specific & Niche Tools2025-06-06T00:55:11.604116Court provides smart contracts for arbitrating...SUCCESSApplication-Specific & Niche ToolsApplication-Specific & Niche ToolsDeFi Security & MonitoringApplication-Specific & Niche ToolsGovernance & DAO Tooling
4j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM=Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo=ensdomainsENSensdomainsop-resolver2022-11-03 11:14:36.000 UTC2025-05-20 03:21:33.000 UTC176...TrueInteroperability & Cross-chain2025-06-06T00:55:03.917944The Optimism Resolver project facilitates stor...SUCCESSInfrastructure & Node OperationsInteroperability & Cross-chainInteroperability & Cross-chainUser Interface & Integration SDKsGovernance & DAO Tooling
..................................................................
5234AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuels-rs2021-10-31 22:33:54.000 UTC2025-06-03 17:34:29.000 UTC437471355...TrueDevelopment Frameworks2025-06-05T14:24:14.479181The fuels-rs project provides a Rust SDK for t...SUCCESSCore Protocol InterfacesDevelopment FrameworksDevelopment FrameworksDevelopment FrameworksDevelopment Frameworks
5235JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w=vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g=go-ethereumgethethereumgo-ethereum2013-12-26 13:05:46.000 UTC2025-06-03 16:54:54.000 UTC4906520888...TrueInfrastructure & Node Operations2025-06-05T14:24:08.096520Go Ethereum (geth) is a Golang implementation ...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5236imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabsfuel-core2020-08-27 21:12:14.000 UTC2025-06-03 17:34:30.000 UTC576372852...TrueInfrastructure & Node Operations2025-06-05T14:24:01.176979The Fuel client implements a Fuel node, provid...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
5237XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4=AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM=fuellabsFuel Networkfuellabssway2021-01-19 20:54:33.000 UTC2025-06-03 17:34:31.000 UTC622555405...TrueLanguage & Compilation Tools2025-06-05T14:23:54.181337Sway is a programming language designed for th...SUCCESSLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation ToolsLanguage & Compilation Tools
5238ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA=cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o=base-orgBasebase-orgnode2023-02-01 13:55:02.000 UTC2025-02-10 01:22:12.000 UTC685682635...TrueInfrastructure & Node Operations2025-06-05T14:23:47.813647The Base Node project provides Docker configur...SUCCESSInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node OperationsInfrastructure & Node Operations
\n", + "

5239 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " repo_artifact_id \\\n", + "0 jXXy/fnXRva/c1jf/Weav9O3pWDHf/lVArjj0/oteUM= \n", + "1 Ymt6ZmVh75JL7ml3IM9hU32qFd+GB84kXijLttRFS+w= \n", + "2 sTL/I78T3P6uyVN+En480uSHiTXT7UdHPmKQlvWxCkc= \n", + "3 9C23r6x0hqtbR/lB/1nYpc5KVgCtm4ga+lOJa4gd2cY= \n", + "4 j9aT6b4e9dCsCbJ42JXen90EHik4VhyLFvX2RjeiJGM= \n", + "... ... \n", + "5234 AcuAtRmOCfY1rQN0rAx8iP5pdbgveBahZYSWK2leQq4= \n", + "5235 JfvNeHojsqThZKXGfbrSSW4JIf2db88eIku67txzj9w= \n", + "5236 imBvQgAogfFYL0+hque3sUxe+dN53nsDQFoz1q1jgDA= \n", + "5237 XK2KsRrMXU8N9WUAXb0V+X2pZWgx9H2UCtaJ6IONUC4= \n", + "5238 ACDSfw399At2CyBKEzgNCwOZ3zvC990eWZjGw+Z8isA= \n", + "\n", + " project_id \\\n", + "0 KLkMfahLmIEtzAbihkJ4U9p0e/3zWn0iN6xBrwN++lU= \n", + "1 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "2 4JnmAZ2ikJpcjoRMX+2ZLxAQ10don2FQd6yPyDaBZ20= \n", + "3 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "4 Fs/BFdYMfeuzzzWPOX3dtOA6Z4AOJsB2eO2JIoZEzUo= \n", + "... ... \n", + "5234 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5235 vD6QgU2nKpWiutcCnblDJkVHtDkLDH6oyITV+xpe3+g= \n", + "5236 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5237 AmxsQKHnsygqA+a7WJawinHjVclh84R+edks3EL9jiM= \n", + "5238 cJt1yXO/geeLxyt++Pe5iU+kUyklaoGot3rHqrDNk1o= \n", + "\n", + " project_name display_name \\\n", + "0 ethereum-attestation-service Ethereum Attestation Service \n", + "1 erigontech erigontech \n", + "2 erigontech erigontech \n", + "3 ensdomains ENS \n", + "4 ensdomains ENS \n", + "... ... ... \n", + "5234 fuellabs Fuel Network \n", + "5235 go-ethereum geth \n", + "5236 fuellabs Fuel Network \n", + "5237 fuellabs Fuel Network \n", + "5238 base-org Base \n", + "\n", + " repo_artifact_namespace repo_artifact_name \\\n", + "0 ethereum-attestation-service eas-docs-site \n", + "1 erigontech gmp-wasm \n", + "2 erigontech diagnostics \n", + "3 ensdomains court \n", + "4 ensdomains op-resolver \n", + "... ... ... \n", + "5234 fuellabs fuels-rs \n", + "5235 ethereum go-ethereum \n", + "5236 fuellabs fuel-core \n", + "5237 fuellabs sway \n", + "5238 base-org node \n", + "\n", + " created_at updated_at star_count \\\n", + "0 2022-11-09 19:39:56.000 UTC 2025-06-02 15:51:08.000 UTC 17 \n", + "1 2020-12-16 08:27:02.000 UTC 2025-03-24 16:40:59.000 UTC 17 \n", + "2 2023-02-22 11:05:42.000 UTC 2025-04-25 07:42:52.000 UTC 17 \n", + "3 2018-05-02 19:41:02.000 UTC 2025-05-20 03:41:25.000 UTC 17 \n", + "4 2022-11-03 11:14:36.000 UTC 2025-05-20 03:21:33.000 UTC 17 \n", + "... ... ... ... \n", + "5234 2021-10-31 22:33:54.000 UTC 2025-06-03 17:34:29.000 UTC 43747 \n", + "5235 2013-12-26 13:05:46.000 UTC 2025-06-03 16:54:54.000 UTC 49065 \n", + "5236 2020-08-27 21:12:14.000 UTC 2025-06-03 17:34:30.000 UTC 57637 \n", + "5237 2021-01-19 20:54:33.000 UTC 2025-06-03 17:34:31.000 UTC 62255 \n", + "5238 2023-02-01 13:55:02.000 UTC 2025-02-10 01:22:12.000 UTC 68568 \n", + "\n", + " fork_count ... is_actively_maintained \\\n", + "0 39 ... True \n", + "1 4 ... True \n", + "2 21 ... True \n", + "3 7 ... True \n", + "4 6 ... True \n", + "... ... ... ... \n", + "5234 1355 ... True \n", + "5235 20888 ... True \n", + "5236 2852 ... True \n", + "5237 5405 ... True \n", + "5238 2635 ... 
True \n", + "\n", + " final_recommendation processing_timestamp \\\n", + "0 Developer Experience Tools 2025-06-06T00:55:35.737447 \n", + "1 Cryptography & Primitives 2025-06-06T00:55:27.645719 \n", + "2 Infrastructure & Node Operations 2025-06-06T00:55:20.010674 \n", + "3 Application-Specific & Niche Tools 2025-06-06T00:55:11.604116 \n", + "4 Interoperability & Cross-chain 2025-06-06T00:55:03.917944 \n", + "... ... ... \n", + "5234 Development Frameworks 2025-06-05T14:24:14.479181 \n", + "5235 Infrastructure & Node Operations 2025-06-05T14:24:08.096520 \n", + "5236 Infrastructure & Node Operations 2025-06-05T14:24:01.176979 \n", + "5237 Language & Compilation Tools 2025-06-05T14:23:54.181337 \n", + "5238 Infrastructure & Node Operations 2025-06-05T14:23:47.813647 \n", + "\n", + " summary readme_status \\\n", + "0 The project provides documentation for the Eth... SUCCESS \n", + "1 The GNU MP Library provides arbitrary precisio... SUCCESS \n", + "2 The Erigon Diagnostics System is a web applica... SUCCESS \n", + "3 Court provides smart contracts for arbitrating... SUCCESS \n", + "4 The Optimism Resolver project facilitates stor... SUCCESS \n", + "... ... ... \n", + "5234 The fuels-rs project provides a Rust SDK for t... SUCCESS \n", + "5235 Go Ethereum (geth) is a Golang implementation ... SUCCESS \n", + "5236 The Fuel client implements a Fuel node, provid... SUCCESS \n", + "5237 Sway is a programming language designed for th... SUCCESS \n", + "5238 The Base Node project provides Docker configur... SUCCESS \n", + "\n", + " protocol_architect ecosystem_analyst \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "3 Application-Specific & Niche Tools Application-Specific & Niche Tools \n", + "4 Infrastructure & Node Operations Interoperability & Cross-chain \n", + "... ... ... \n", + "5234 Core Protocol Interfaces Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " security_researcher user_experience_advocate \\\n", + "0 Developer Experience Tools Developer Experience Tools \n", + "1 Cryptography & Primitives Cryptography & Primitives \n", + "2 DeFi Security & Monitoring Infrastructure & Node Operations \n", + "3 DeFi Security & Monitoring Application-Specific & Niche Tools \n", + "4 Interoperability & Cross-chain User Interface & Integration SDKs \n", + "... ... ... \n", + "5234 Development Frameworks Development Frameworks \n", + "5235 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations Infrastructure & Node Operations \n", + "\n", + " governance_specialist \n", + "0 Application-Specific & Niche Tools \n", + "1 Cryptography & Primitives \n", + "2 Infrastructure & Node Operations \n", + "3 Governance & DAO Tooling \n", + "4 Governance & DAO Tooling \n", + "... ... 
\n", + "5234 Development Frameworks \n", + "5235 Infrastructure & Node Operations \n", + "5236 Infrastructure & Node Operations \n", + "5237 Language & Compilation Tools \n", + "5238 Infrastructure & Node Operations \n", + "\n", + "[5239 rows x 22 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_parquet('output/ethereum_repos_unified.parquet')\n", + "df[\"categorizations_list\"] = df[\"categorizations_json\"].apply(json.loads)\n", + "\n", + "def persona_to_category_map(cats_list):\n", + " return { d[\"persona_name\"]: d[\"category\"] for d in cats_list }\n", + "df_persona_map = df[\"categorizations_list\"].apply(persona_to_category_map)\n", + "df_persona_cols = pd.json_normalize(df_persona_map)\n", + "\n", + "df = df.join(df_persona_cols)\n", + "df = df.drop(columns=[\"categorizations_list\", \"categorizations_json\", \"readme_md\"])\n", + "#df.to_csv('categorizations.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "368d46ba-cedc-455d-a246-925b6c996090", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/ethereum-repo-clusters/README.md b/experiments/ethereum-repo-clusters/README.md new file mode 100644 index 00000000..5f5db141 --- /dev/null +++ b/experiments/ethereum-repo-clusters/README.md @@ -0,0 +1,402 @@ +# Ethereum Repo Clusters + +A Python package for automatically clustering Ethereum development tools and libraries based on their README content using AI-driven analysis and multiple personas. + +## Overview + +This project implements a pipeline to: +1. Fetch repository data from the OSO (Open Source Observer) database. +2. Retrieve corresponding README files from GitHub. +3. Generate concise project summaries using Google's Gemini AI. +4. Employ multiple configurable AI personas to categorize each project based on its summary and metadata. +5. Consolidate these categorizations, using a star-count weighted approach for projects with multiple repositories, to produce a final recommended category. + +The entire process is managed via a Command Line Interface (CLI). + +## Features + +- Fetches comprehensive repository data via OSO, including fork status and activity tracking. +- Retrieves and processes README.md files from GitHub with robust error handling. +- Utilizes Google's Gemini AI for intelligent summary generation. +- Employs a multi-persona approach for nuanced project categorization. +- Supports an arbitrary number of configurable AI personas. +- Calculates final project recommendations using star-count weighted consolidation. +- Offers both modular pipeline and unified processing approaches. +- Provides detailed tracking of repository status (active/inactive, fork/non-fork). +- Handles empty or error READMEs gracefully with "UNCATEGORIZED" status. +- Includes timestamps for all categorization operations. +- Test mode for quick runs on a subset of data. +- Outputs data at various stages in Parquet and CSV formats (with README text removed from CSV for readability). 
+- Supports easy resumption of processing and addition of new repositories. +- Features comprehensive progress bars at multiple levels for better visibility into processing status. +- **Checkpoint System**: Automatically saves progress after each step, allowing for seamless recovery from interruptions. +- **Incremental Saving**: Saves results after processing each repository, ensuring no work is lost if the process is interrupted. +- **Resume Capability**: Automatically detects partially processed repositories and continues from where it left off. + +## Prerequisites + +- Python 3.10+ +- Access to OSO, GitHub, and Google Gemini APIs. + +## Installation + +1. **Clone the repository:** + ```bash + git clone + cd ethereum-repo-clusters + ``` + +2. **Set up a virtual environment (recommended):** + ```bash + python -m venv venv + source venv/bin/activate # On Windows use `venv\Scripts\activate` + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + +4. **Install the package in editable mode (optional, for development):** + ```bash + pip install -e . + ``` + +5. **Create a `.env` file** in the project root directory (`ethereum-repo-clusters/`) and add your API keys: + ```env + OSO_API_KEY="your_oso_api_key" + GITHUB_TOKEN="your_github_token" # A GitHub Personal Access Token with repo access + GEMINI_API_KEY="your_gemini_api_key" + ``` + These keys are loaded via `ethereum-repo-clusters/config/settings.py`. + +## Configuration + +The project uses a combination of a JSON configuration file and Python modules for settings: + +- **`pipeline_config.json`**: + - Located at the project root. + - Controls operational settings like `output_dir`, `test_mode`, `test_mode_limit`, AI model name (`gemini_model`), and batch sizes for AI processing. + - If this file is missing, it will be automatically created with default values on the first run. + - Values in this file override defaults sourced from Python modules. + +- **AI Personas (`ethereum-repo-clusters/config/prompts/personas.py`):** + - Define the different AI personas used for categorization. + - Each persona is a dictionary with `name`, `title`, `description`, and a `prompt` template. + - Modify this Python list directly to add, remove, or change personas. + +- **Categories (`ethereum-repo-clusters/config/prompts/categories.py`):** + - Defines the list of possible categories projects can be assigned to. + - Includes `CATEGORIES` (list of dicts with `category` and `description`) and `CATEGORY_NAMES` (a simple list of category names). + - Edit this file to update the categorization taxonomy. + +- **Prompt Templates (`ethereum-repo-clusters/config/prompts/summary_prompts.py`):** + - Contains `SUMMARY_PROMPT` (for generating project summaries) and `TAGS_PROMPT` (for an auxiliary tag generation, currently not central to categorization). + - These are used by the `AIService`. + +- **Core Settings (`ethereum-repo-clusters/config/settings.py`):** + - Loads API keys from the `.env` file. + - Defines default values for `GEMINI_MODEL` and `OUTPUT_DIR` if not specified in `pipeline_config.json`. + +## Usage (CLI) + +The project is operated via the command line using `python -m ethereum-repo-clusters`. + +**General Command Structure:** +```bash +python -m ethereum-repo-clusters [GLOBAL_OPTIONS] COMMAND [COMMAND_OPTIONS] +``` + +**Global Options:** +- `--test-mode`: Runs the specified command(s) in test mode, processing a limited number of repositories (defined by `test_mode_limit` in `pipeline_config.json`, sorted by stars). 
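To illustrate what the test-mode limit does, here is a minimal sketch — not the project's actual implementation — of applying `test_mode` and `test_mode_limit` to a repository DataFrame. The `is_test_mode()` and `get_test_mode_limit()` accessors are the real `ConfigManager` methods shown later in this diff; the `apply_test_mode_limit` helper, the DataFrame shape, and the reading of "sorted by stars" as "highest star count first" are assumptions.

```python
import pandas as pd

def apply_test_mode_limit(repos: pd.DataFrame, test_mode: bool, limit: int) -> pd.DataFrame:
    """Hypothetical helper: in test mode, keep only the top-`limit` repos by stars."""
    if not test_mode:
        return repos
    # "Sorted by stars" is read here as "highest star_count first".
    return repos.sort_values("star_count", ascending=False).head(limit)

# Usage sketch with the ConfigManager accessors defined in config_manager.py:
# repos_df = apply_test_mode_limit(repos_df,
#                                  config_manager.is_test_mode(),
#                                  config_manager.get_test_mode_limit())
```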
+ +**Main Commands:** + +- **`fetch_repos`**: Fetches repository data from OSO and READMEs from GitHub. + ```bash + python -m ethereum-repo-clusters fetch_repos + ``` + - `--force-refresh`: Wipes existing raw repository data and re-fetches. + - `--fetch-new-only`: Only fetches repositories that don't exist in current data. + +- **`generate_summaries`**: Generates AI summaries for fetched repositories. + ```bash + python -m ethereum-repo-clusters generate_summaries + ``` + - `--force-refresh`: Wipes existing summaries and regenerates them. + - `--new-only`: Only generates summaries for repositories that don't have summaries yet. + +- **`categorize`**: Categorizes projects using all defined AI personas. + ```bash + python -m ethereum-repo-clusters categorize + ``` + - `--force-refresh`: Wipes existing categorizations and re-runs. + - `--persona `: Processes only the specified persona. Can be combined with `--force-refresh`. Example: + ```bash + python -m ethereum-repo-clusters categorize --persona keyword_spotter --force-refresh + ``` + - `--new-only`: Only categorizes repositories that don't have categories yet. + +- **`consolidate`**: Consolidates categorizations from all personas and generates final project recommendations. + ```bash + python -m ethereum-repo-clusters consolidate + ``` + *(This step does not typically require a force-refresh as it always processes the latest categorized data.)* + +**Persona Management (Informational):** +The CLI includes commands related to personas, but due to refactoring, persona definitions are now managed directly in `ethereum-repo-clusters/config/prompts/personas.py`. These CLI commands are informational: + +- `python -m ethereum-repo-clusters personas list`: Lists personas currently defined in `personas.py`. +- `python -m ethereum-repo-clusters personas add ...`: Provides instructions on how to add a persona by editing `personas.py`. +- `python -m ethereum-repo-clusters personas remove `: Provides instructions on how to remove a persona by editing `personas.py`. + +**Example Full Run in Test Mode with Full Refresh:** +```bash +# Legacy pipeline approach +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all + +# New unified processor approach (recommended) +python -m ethereum-repo-clusters --test-mode run_all --force-refresh-all --use-unified +``` + +## Workflow + +### Legacy Pipeline (Step-by-Step) + +1. **Fetch Data (`fetch_repos`):** + - Repository metadata is fetched from OSO. + - README.md content is fetched from GitHub for these repositories. + - Output: `output/devtooling_raw.parquet` + +2. **Generate Summaries (`generate_summaries`):** + - READMEs are processed by Gemini AI to create concise summaries. + - Output: `output/devtooling_summarized.parquet` + +3. **Categorize by Persona (`categorize`):** + - Each project summary (with metadata) is evaluated by every defined AI persona. + - Each persona assigns a category based on its specific prompt and the global category list. + - Output: Individual Parquet files per persona in `output/categorized/` (e.g., `output/categorized/keyword_spotter.parquet`). + +4. **Consolidate Recommendations (`consolidate`):** + - Categorizations from all personas are merged. + - For each project: + - If it's a single-repository project, the recommendation is based on a star-weighted aggregation of persona assignments for that repo. + - If it's a multi-repository project, the recommendation is determined by a star-count weighted aggregation of all persona assignments across all its repositories. 
The category with the highest total star weight wins. + - Output: `output/devtooling_full.parquet` and `output/devtooling_consolidated.csv`. + +### New Unified Processor (Recommended) + +The new unified processor combines all steps into a single efficient pipeline: + +1. **Process Repositories (`process_unified`):** + - Repository metadata is fetched from OSO, including fork status and activity tracking. + - README.md content is fetched from GitHub with robust error handling. + - For each repository with a valid README: + - A summary is generated immediately. + - All personas categorize the repository in sequence. + - Results are stored with timestamps for each operation. + - For repositories with empty or error READMEs: + - Status is tracked as "EMPTY" or "ERROR". + - All categorizations are marked as "UNCATEGORIZED". + - A final recommendation is determined based on the most common category across personas. + - Output: `output/ethereum_repos_unified.parquet` and `output/ethereum_repos_unified.csv`. + +The unified processor offers several advantages: +- Single pass through repositories (more efficient) +- Better error handling and status tracking +- Easier to resume processing or add new repositories +- Comprehensive data structure with all information in one place +- Timestamps for all operations for better traceability +- Detailed progress bars for tracking processing status at multiple levels +- CSV output with README text removed for improved readability +- Checkpoint system that saves progress after each step +- Incremental saving that preserves work even if interrupted +- Automatic resume capability that continues from where it left off + +## Output Files + +All output data is stored in the directory specified by `output_dir` in `pipeline_config.json` (default is `output/`). + +### Legacy Pipeline Output + +- **`devtooling_raw.parquet`**: Raw data fetched from OSO, augmented with GitHub README content. +- **`devtooling_summarized.parquet`**: Repositories with their AI-generated summaries. +- **`categorized/.parquet`**: Dataframe for each persona, containing the original summary data plus that persona's assigned category and reason. +- **`devtooling_full.parquet`**: The final consolidated dataset, with one row per project, including the overall recommendation, total stars, repo count, sample summary, and individual persona category modes. +- **`devtooling_consolidated.csv`**: A CSV version of the final consolidated data for easier viewing. + +### Unified Processor Output + +- **`ethereum_repos_unified.parquet`**: Comprehensive dataset containing all repositories with their metadata, summaries, and categorizations in a single structure. +- **`ethereum_repos_unified.csv`**: A CSV version of the unified data for easier viewing, with README text removed and long text fields truncated for readability. +- **`processing_checkpoint.json`**: Checkpoint file that tracks processing progress, allowing for seamless recovery from interruptions. 
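The `CategorySummary.ipynb` notebook added in this change shows how to work with the unified Parquet output; a condensed version of that pattern, loading the file and pivoting the per-persona categorizations into one column per persona, looks like this:

```python
import json
import pandas as pd

# Load the unified output written by the unified processor.
df = pd.read_parquet("output/ethereum_repos_unified.parquet")

# Each row stores the persona categorizations as a JSON string; expand them
# into one column per persona (protocol_architect, ecosystem_analyst, ...).
df["categorizations_list"] = df["categorizations_json"].apply(json.loads)
persona_cols = pd.json_normalize(
    df["categorizations_list"].apply(
        lambda cats: {c["persona_name"]: c["category"] for c in cats}
    )
)
df = df.join(persona_cols).drop(
    columns=["categorizations_list", "categorizations_json", "readme_md"]
)
```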
+ +### Unified Data Structure + +The unified processor creates a comprehensive data structure with the following key fields: + +```json +{ + "repo_artifact_id": "...", + "project_id": "...", + "repo_artifact_namespace": "...", + "repo_artifact_name": "...", + "is_fork": true/false, + "is_actively_maintained": true/false, + "last_updated": "2024-12-01", + "star_count": 100, + "readme_status": "SUCCESS/EMPTY/ERROR", + "summary": "...", + "categorizations": [ + { + "persona_name": "keyword_spotter", + "category": "Developer Tools", + "reason": "Contains keywords like 'CLI', 'build tool'...", + "timestamp": "2025-01-05T09:15:00Z" + }, + { + "persona_name": "senior_strategist", + "category": "Infrastructure", + "reason": "Mature project with strong adoption...", + "timestamp": "2025-01-05T09:15:01Z" + }, + { + "persona_name": "workflow_wizard", + "category": "Developer Tools", + "reason": "Streamlines development workflow...", + "timestamp": "2025-01-05T09:15:02Z" + } + ], + "final_recommendation": "Developer Tools", + "processing_timestamp": "2025-01-05T09:15:02Z" +} +``` + +This structure makes it easy to: +- Track which repositories have been processed +- Identify repositories with errors or empty READMEs +- See the categorization from each persona with timestamps +- Filter repositories by fork status or activity +- Resume processing from where you left off + +## Development Notes +- The project uses `tqdm` for progress bars during long operations, with detailed progress tracking at multiple levels: + - Overall batch processing + - Repository processing within each batch + - README fetching for each repository + - Categorization with each persona +- `DataManager` class in `ethereum-repo-clusters/pipeline/data_manager.py` handles all data persistence (reading/writing Parquet files). +- `AIService` in `ethereum-repo-clusters/processing/ai_service.py` abstracts interactions with the Gemini API. +- `UnifiedProcessor` in `ethereum-repo-clusters/pipeline/unified_processor.py` provides the new streamlined processing approach. +- The CLI in `ethereum-repo-clusters/cli/main_cli.py` supports both legacy and unified processing approaches. +- Output files are saved to the local `output/` directory in the current repository. + +## New CLI Commands + +### Unified Processing + +```bash +# Process repositories with the unified processor +python -m ethereum-repo-clusters process_unified [OPTIONS] + +# Options: +# --force-refresh Force refresh all data, ignoring existing. +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +# --limit INTEGER Limit the number of repositories to process. +``` + +### Run All with Unified Processor + +```bash +# Run the entire pipeline using the unified processor +python -m ethereum-repo-clusters run_all --use-unified [OPTIONS] + +# Additional options with --use-unified: +# --include-forks Include forked repositories in processing. +# --include-inactive Include repositories not updated in the last year. +``` + +## Adding New Repositories + +To add new repositories to the analysis: + +1. The unified processor automatically detects which repositories have already been processed. +2. New repositories from OSO will be processed automatically on the next run. +3. To add repositories manually, you can: + - Update the OSO query in `fetcher.py` to include additional repositories. + - Create a custom script that adds repositories to the unified data structure. 
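As a small illustration of the filtering described in the unified data structure section above, the sketch below uses the documented field names (`readme_status`, `is_fork`, `is_actively_maintained`, `final_recommendation`) to pull out repositories that still need attention and to summarize final categories. The column names are taken from the example structure, so adjust if the actual schema differs.

```python
import pandas as pd

df = pd.read_parquet("output/ethereum_repos_unified.parquet")

# Repositories whose READMEs were empty or failed to fetch.
needs_review = df[df["readme_status"].isin(["EMPTY", "ERROR"])]
print(f"{len(needs_review)} repositories need a manual look")

# Category distribution for actively maintained, non-fork repositories.
active = df[df["is_actively_maintained"] & ~df["is_fork"]]
print(active["final_recommendation"].value_counts().head(10))
```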
+ +## Error Handling + +The unified processor handles errors gracefully: + +- Empty READMEs: Marked with `readme_status="EMPTY"` and categorized as "UNCATEGORIZED". +- Error fetching README: Marked with `readme_status="ERROR"` and categorized as "UNCATEGORIZED". +- API errors during categorization: The specific persona's categorization is marked as "UNCATEGORIZED" with the error reason. + +This approach ensures that all repositories are included in the final output, even if they couldn't be fully processed. + +## Checkpoint System + +The unified processor now includes a robust checkpoint system that makes it resilient to interruptions: + +### How It Works + +1. **Incremental Saving**: Results are saved after processing each repository, not just at the end. +2. **Checkpoint File**: A JSON file (`output/processing_checkpoint.json`) tracks: + - Which repositories have been fully processed + - Which repositories are partially processed and their current state + - The last repository that was successfully processed + +3. **Granular Progress Tracking**: The checkpoint tracks progress at multiple levels: + - README fetching status + - Summary generation status + - Which personas have completed categorization + +4. **Resume Logic**: When restarted after an interruption, the processor: + - Skips repositories that have been fully processed + - Continues from where it left off for partially processed repositories + - Preserves all work that was completed before the interruption + +5. **Space Optimization**: Once a repository is fully processed, its partial results are removed from the checkpoint file to save space. + +### Benefits + +- **No Lost Work**: Even if interrupted during a long-running process, no work is lost. +- **API Efficiency**: Avoids redundant API calls to GitHub and Gemini, saving rate limits and costs. +- **Time Savings**: Picks up exactly where it left off, avoiding redundant processing. +- **Resilience**: Handles network issues, API timeouts, and other temporary failures gracefully. + +### Example Checkpoint Structure + +```json +{ + "last_processed_repo_id": "ethereum/solidity", + "processed_repos": ["openzeppelin/openzeppelin-contracts", "ethereum/solidity"], + "partial_results": { + "ipfs/kubo": { + "readme_fetched": true, + "readme_status": "SUCCESS", + "summary_generated": true, + "personas_completed": ["protocol_architect", "ecosystem_analyst"], + "categorizations": [ + { + "persona_name": "protocol_architect", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:30.903574" + }, + { + "persona_name": "ecosystem_analyst", + "category": "Infrastructure & Node Operations", + "reason": "...", + "timestamp": "2025-06-05T13:53:32.238039" + } + ] + } + } +} +``` + +This checkpoint system ensures that the processing pipeline is robust and can handle interruptions gracefully, making it suitable for processing large numbers of repositories over extended periods. 
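To make the resume behaviour concrete, here is a minimal sketch of the checkpoint pattern described above. It follows the example checkpoint structure, but the function names and the `process_repo` stand-in are illustrative only, not the actual `UnifiedProcessor` API.

```python
import json
from pathlib import Path

CHECKPOINT_PATH = Path("output/processing_checkpoint.json")

def load_checkpoint() -> dict:
    """Return the saved checkpoint, or a fresh one if none exists yet."""
    if CHECKPOINT_PATH.exists():
        return json.loads(CHECKPOINT_PATH.read_text())
    return {"last_processed_repo_id": None, "processed_repos": [], "partial_results": {}}

def save_checkpoint(checkpoint: dict) -> None:
    """Persist progress after every repository so an interruption loses no work."""
    CHECKPOINT_PATH.write_text(json.dumps(checkpoint, indent=2))

def is_done(repo_id: str, checkpoint: dict) -> bool:
    """Fully processed repositories are skipped on resume."""
    return repo_id in checkpoint["processed_repos"]

def process_repo(repo_id: str, partial: dict | None) -> None:
    """Stand-in for the real per-repo work (README fetch, summary, categorization)."""
    ...

def run(repo_ids: list[str]) -> None:
    checkpoint = load_checkpoint()
    for repo_id in repo_ids:
        if is_done(repo_id, checkpoint):
            continue
        # Resume from any partial results recorded for this repository.
        process_repo(repo_id, partial=checkpoint["partial_results"].get(repo_id))
        checkpoint["processed_repos"].append(repo_id)
        checkpoint["partial_results"].pop(repo_id, None)  # space optimization
        checkpoint["last_processed_repo_id"] = repo_id
        save_checkpoint(checkpoint)  # incremental save after every repo
```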
diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py new file mode 100644 index 00000000..4bedcbac --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/__main__.py @@ -0,0 +1,9 @@ +from .cli.main_cli import cli + +def main(): + # The obj={} is a way to initialize Click's context object + # if it's not being run directly by the `click` runner (e.g. `python -m devtooling_labels`) + cli(obj={}) + +if __name__ == "__main__": + main() diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py new file mode 100644 index 00000000..986cde36 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/__init__.py @@ -0,0 +1,7 @@ +# This file makes the 'cli' directory a Python package. + +from .main_cli import cli + +__all__ = [ + "cli" +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py new file mode 100644 index 00000000..6a1a8ddd --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/cli/main_cli.py @@ -0,0 +1,243 @@ +import click +from pathlib import Path + +from ..config.config_manager import ConfigManager +from ..pipeline.data_manager import DataManager +from ..processing.ai_service import AIService +from ..pipeline.repository_fetcher import RepositoryFetcherStep +from ..pipeline.summary_generator import SummaryGeneratorStep +from ..pipeline.categorizer import CategorizerStep +from ..pipeline.consolidator import ConsolidatorStep +from ..pipeline.unified_processor import UnifiedProcessor + +# Initialize ConfigManager globally or pass as context +# For simplicity here, we'll initialize it where needed or once at the top. +# A more robust Click app might use a context object. +config_manager = ConfigManager() # Loads default or existing pipeline_config.json + +@click.group() +@click.option('--test-mode', is_flag=True, help='Run in test mode (limits fetched repos, uses test_mode_limit from config).') +@click.pass_context +def cli(ctx, test_mode): + """DevTooling Labels CLI for processing and categorizing repositories.""" + ctx.ensure_object(dict) + + # Update config if test_mode flag is set via CLI + # This overrides the value in pipeline_config.json for this run + if test_mode: + config_manager.set("test_mode", True) + # No need to save if it's a per-run override. + # If we want to persist it: config_manager.save_config() + print(f"CLI flag --test-mode is set. Running in test mode. Limit: {config_manager.get_test_mode_limit()} repos.") + else: + # If not set by CLI, respect the config file's test_mode setting + # Or, explicitly set to False if CLI should always override to False when flag not present + # config_manager.set("test_mode", False) # Uncomment if CLI flag absence means test_mode is OFF + pass # Current behavior: respects config file if CLI flag is absent. 
+ + # Initialize services and pass them via context if needed by multiple commands + # Or initialize them within each command + output_dir = config_manager.get_output_dir() + data_manager = DataManager(output_dir=output_dir, config=config_manager) + ai_service = AIService(config_manager=config_manager) + + ctx.obj['config_manager'] = config_manager + ctx.obj['data_manager'] = data_manager + ctx.obj['ai_service'] = ai_service + ctx.obj['output_dir'] = output_dir + + +@cli.command("fetch_repos") +@click.option('--force-refresh', is_flag=True, help='Force refresh repository data, ignoring existing.') +@click.option('--fetch-new-only', is_flag=True, help='Only fetch repositories that don\'t exist in current data.') +@click.pass_context +def fetch_repos_command(ctx, force_refresh, fetch_new_only): + """Fetches repositories and their READMEs.""" + print("Executing: Fetch Repositories") + data_manager = ctx.obj['data_manager'] + # ConfigManager is already aware of test_mode from the group command + config_mgr = ctx.obj['config_manager'] + + repo_fetcher_step = RepositoryFetcherStep(data_manager=data_manager, config_manager=config_mgr) + repo_fetcher_step.run(force_refresh=force_refresh, fetch_new_only=fetch_new_only) + print("Repository fetching complete.") + + +@cli.command("generate_summaries") +@click.option('--force-refresh', is_flag=True, help='Force refresh summaries, ignoring existing.') +@click.option('--new-only', is_flag=True, help='Generate summaries only for repositories that don\'t have summaries yet.') +@click.pass_context +def generate_summaries_command(ctx, force_refresh, new_only): + """Generates summaries for the fetched repositories.""" + print("Executing: Generate Summaries") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + summary_generator_step = SummaryGeneratorStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + summary_generator_step.run(force_refresh=force_refresh, new_only=new_only) + print("Summary generation complete.") + + +@cli.command("categorize") +@click.option('--force-refresh', is_flag=True, help='Force refresh categories, ignoring existing.') +@click.option('--persona', help='Process only the specified persona.') +@click.option('--new-only', is_flag=True, help='Categorize only repositories that don\'t have categories yet.') +@click.pass_context +def categorize_command(ctx, force_refresh, persona, new_only): + """Categorizes projects using AI personas.""" + print("Executing: Categorize") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + categorizer_step = CategorizerStep( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + categorizer_step.run(force_refresh=force_refresh, target_persona_name=persona, new_only=new_only) + print("Categorization complete.") + + +@cli.command("consolidate") +@click.pass_context +def consolidate_command(ctx): + """Consolidates categorizations and generates final recommendations.""" + print("Executing: Consolidate Analysis") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + + consolidator_step = ConsolidatorStep(data_manager=data_manager, config_manager=config_mgr) + consolidator_step.run() + print("Consolidation complete.") + + +@cli.command("process_unified") +@click.option('--force-refresh', is_flag=True, help='Force refresh all data, ignoring existing.') 
+@click.option('--include-forks', is_flag=True, help='Include forked repositories in processing.') +@click.option('--include-inactive', is_flag=True, help='Include repositories not updated in the last year.') +@click.option('--limit', type=int, help='Limit the number of repositories to process.') +@click.pass_context +def process_unified_command(ctx, force_refresh, include_forks, include_inactive, limit): + """ + Unified processing: fetches repos, READMEs, generates summaries, and categorizes in one pass. + Outputs a single comprehensive dataset with all information. + """ + print("Executing: Unified Processing Pipeline") + data_manager = ctx.obj['data_manager'] + config_mgr = ctx.obj['config_manager'] + ai_service = ctx.obj['ai_service'] + + processor = UnifiedProcessor( + data_manager=data_manager, + config_manager=config_mgr, + ai_service=ai_service + ) + + processor.run( + force_refresh=force_refresh, + include_forks=include_forks, + inactive_repos=include_inactive, + limit=limit + ) + + print("Unified processing complete.") + print(f"Results saved to:") + print(f" - {data_manager.unified_parquet_path} (Parquet format)") + print(f" - {data_manager.unified_csv_path} (CSV format)") + + +@cli.command("run_all") +@click.option('--force-refresh-all', is_flag=True, help='Force refresh all data stages.') +@click.option('--force-refresh-repos', is_flag=True, help='Force refresh repository data.') +@click.option('--force-refresh-summaries', is_flag=True, help='Force refresh summaries.') +@click.option('--force-refresh-categories', is_flag=True, help='Force refresh categories.') +@click.option('--use-unified', is_flag=True, help='Use the new unified processor instead of the legacy pipeline.') +@click.option('--include-forks', is_flag=True, help='Include forked repositories (only with --use-unified).') +@click.option('--include-inactive', is_flag=True, help='Include inactive repositories (only with --use-unified).') +@click.option('--limit', type=int, help='Limit the number of repositories to process (only with --use-unified).') +@click.pass_context +def run_all_command(ctx, force_refresh_all, force_refresh_repos, force_refresh_summaries, + force_refresh_categories, use_unified, include_forks, include_inactive): + """Runs the entire pipeline: either legacy steps or the new unified processor.""" + + if use_unified: + print("Executing: Run All Using Unified Processor") + ctx.invoke( + process_unified_command, + force_refresh=force_refresh_all, + include_forks=include_forks, + include_inactive=include_inactive, + limit=limit + ) + else: + print("Executing: Run All Pipeline Steps (Legacy)") + # Determine force_refresh flags for each step + fr_repos = force_refresh_all or force_refresh_repos + fr_summaries = force_refresh_all or force_refresh_summaries + fr_categories = force_refresh_all or force_refresh_categories + + # Invoke other commands with determined force_refresh settings + # The --test-mode flag from the main group is implicitly handled by ConfigManager + ctx.invoke(fetch_repos_command, force_refresh=fr_repos) + ctx.invoke(generate_summaries_command, force_refresh=fr_summaries) + ctx.invoke(categorize_command, force_refresh=fr_categories, persona=None, new_only=False) # Process all personas + ctx.invoke(consolidate_command) + + print("Full pipeline execution complete.") + +# Commands for managing personas in config +@cli.group("personas") +def personas_group(): + """Manage AI personas in the configuration.""" + pass + +@personas_group.command("list") +@click.pass_context +def 
list_personas(ctx): + """Lists all configured personas.""" + config_mgr = ctx.obj['config_manager'] + personas = config_mgr.get_personas() + if not personas: + print("No personas configured.") + return + print("Configured Personas:") + for p in personas: + print(f"- Name: {p['name']}, Title: {p.get('title', 'N/A')}") + +@personas_group.command("add") +@click.option('--name', required=True, help="Unique name for the persona.") +@click.option('--title', required=True, help="Display title for the persona.") +@click.option('--description', required=True, help="Description of the persona's focus.") +@click.option('--prompt-template', required=True, help="Prompt template for the persona's classification task.") +@click.pass_context +def add_persona(ctx, name, title, description, prompt_template): + """Adds a new persona to the configuration.""" + config_mgr = ctx.obj['config_manager'] + new_persona = { + "name": name, + "title": title, + "description": description, + "prompt": prompt_template + } + # config_mgr.add_persona(new_persona) # This method was removed as personas are managed in personas.py + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. '{name}' was not added via CLI.") + print("To add a persona, please edit the personas.py file directly.") + +@personas_group.command("remove") +@click.argument('name') +@click.pass_context +def remove_persona(ctx, name): + """Removes a persona by name. (Note: Persona management is now via personas.py)""" + # config_mgr = ctx.obj['config_manager'] + # config_mgr.remove_persona(name) # This method was removed from ConfigManager + print(f"Persona management is now done by editing devtooling_labels/config/prompts/personas.py. '{name}' was not removed via CLI.") + print("To remove a persona, please edit the personas.py file directly.") + +if __name__ == '__main__': + cli(obj={}) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py new file mode 100644 index 00000000..a9fb5bb1 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/config_manager.py @@ -0,0 +1,139 @@ +import json +from pathlib import Path +from typing import List, Dict, Any +from .settings import PROJECT_ROOT, GEMINI_API_KEY, OSO_API_KEY, GITHUB_TOKEN, GEMINI_MODEL, OUTPUT_DIR +from .prompts.summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + + + +class ConfigManager: + def __init__(self, config_file_name: str = "pipeline_config.json"): + self.config_file_path = PROJECT_ROOT / config_file_name + self.config_data = self._load_config() + + def _load_config(self) -> Dict[str, Any]: + """ + Loads configuration from a JSON file, merging it with defaults. + If the file doesn't exist or is invalid, creates a default one. + Values in the JSON file override default values. + """ + default_config = self._get_default_config() + + if self.config_file_path.exists(): + with open(self.config_file_path, 'r') as f: + try: + loaded_config = json.load(f) + # Merge: loaded_config values override default_config values + merged_config = {**default_config, **loaded_config} + return merged_config + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON from {self.config_file_path}. 
Using full default config.") + # If JSON is corrupt, return the full default config, don't save it over potentially good file yet. + # Or, we could save default_config here if we want to overwrite corrupted file. + # For now, just return defaults for this session. + return default_config + else: + print(f"Config file not found at {self.config_file_path}. Creating and using default config.") + # Save the full default config as the new file + self.save_config(default_config) + return default_config + + def _get_default_config(self) -> Dict[str, Any]: + """Returns the default configuration dictionary.""" + return { + "output_dir": str(OUTPUT_DIR), + "gemini_model": GEMINI_MODEL, + "summary_prompt_template": SUMMARY_PROMPT, + "tags_prompt_template": TAGS_PROMPT, + "test_mode": False, + "test_mode_limit": 5, + "batch_size_summaries": 50, + "batch_size_categorization": 10 # Smaller batch for categorization due to prompt complexity + } + + def save_config(self, config_data: Dict[str, Any] = None): + """Saves the current configuration to the JSON file.""" + data_to_save = config_data if config_data else self.config_data + with open(self.config_file_path, 'w') as f: + json.dump(data_to_save, f, indent=4) + print(f"Configuration saved to {self.config_file_path}") + + def get(self, key: str, default: Any = None) -> Any: + """Gets a configuration value by key.""" + return self.config_data.get(key, default) + + def set(self, key: str, value: Any): + """Sets a configuration value and saves the config.""" + if key in ["gemini_api_key", "oso_api_key", "github_token"]: + print(f"Warning: Attempted to set API key '{key}' in config file. API keys should be managed via .env file.") + return + self.config_data[key] = value + self.save_config() + + # --- API Key Getters --- + def get_gemini_api_key(self) -> str: + """Gets the Gemini API key directly from settings (environment).""" + return GEMINI_API_KEY + + def get_oso_api_key(self) -> str: + """Gets the OSO API key directly from settings (environment).""" + return OSO_API_KEY + + def get_github_token(self) -> str: + """Gets the GitHub token directly from settings (environment).""" + return GITHUB_TOKEN + + # --- Other Getters --- + def get_personas(self) -> List[Dict[str, str]]: + """Gets the list of personas directly from the personas.py module.""" + from .prompts.personas import PERSONAS + return PERSONAS + + # add_persona and remove_persona are removed as personas are managed in personas.py + + def is_test_mode(self) -> bool: + """Checks if test mode is enabled.""" + return self.get("test_mode", False) + + def get_test_mode_limit(self) -> int: + """Gets the limit for test mode.""" + return self.get("test_mode_limit", 5) + + def get_output_dir(self) -> Path: + return Path(self.get("output_dir", str(OUTPUT_DIR))) + + def get_batch_size_summaries(self) -> int: + return self.get("batch_size_summaries", 50) + + def get_batch_size_categorization(self) -> int: + return self.get("batch_size_categorization", 10) + + def get_categories(self) -> List[Dict[str, str]]: + """Gets the categories directly from the categories.py module.""" + from .prompts.categories import CATEGORIES + return CATEGORIES + + def get_category_names(self) -> List[str]: + """Gets the category names directly from the categories.py module.""" + from .prompts.categories import CATEGORY_NAMES + return CATEGORY_NAMES + + def get_summary_prompt_template(self) -> str: + return self.get("summary_prompt_template", "") + + def get_tags_prompt_template(self) -> str: + return 
self.get("tags_prompt_template", "") + +if __name__ == "__main__": + # Example usage: + config_manager = ConfigManager() + print(f"Output Directory: {config_manager.get_output_dir()}") + print(f"Test Mode: {config_manager.is_test_mode()}") + # Example active print for personas: + print("\nPersonas (from personas.py):") + for p in config_manager.get_personas(): + print(f"- {p['name']}: {p['title']}") + + print("\nCategories (from categories.py):") + for cat_name in config_manager.get_category_names(): + print(f"- {cat_name}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py new file mode 100644 index 00000000..c22d1787 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/__init__.py @@ -0,0 +1,11 @@ +from .categories import CATEGORIES, CATEGORY_NAMES +from .personas import PERSONAS +from .summary_prompts import SUMMARY_PROMPT, TAGS_PROMPT + +__all__ = [ + 'CATEGORIES', + 'CATEGORY_NAMES', + 'PERSONAS', + 'SUMMARY_PROMPT', + 'TAGS_PROMPT', +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py new file mode 100644 index 00000000..bd201d31 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/categories.py @@ -0,0 +1,209 @@ +CATEGORIES = [ + # DeFi Application Categories + { + "category": "Lending & Borrowing Protocols", + "description": ( + "Lending & Borrowing Protocols include implementations and SDKs for collateralized lending markets, " + "flash loans, interest rate models, and liquidation mechanisms. These tools handle asset management, " + "risk scoring, and pool accounting, enabling users to lend or borrow assets in a trust-minimized way." + ) + }, + { + "category": "Decentralized Exchanges (DEXs)", + "description": ( + "DEXs power peer-to-peer asset swaps and liquidity provision. This includes AMM (automated market maker) " + "frameworks, order book DEXes, routers, aggregators, and liquidity management libraries. They also often " + "support advanced trading mechanisms like TWAPs, limit orders, and MEV protection." + ) + }, + { + "category": "Derivatives & Synthetic Assets", + "description": ( + "Derivatives & Synthetic Assets frameworks implement perpetual futures, options, and collateralized synthetic " + "asset systems. These toolkits involve complex pricing oracles, risk engines, margin systems, and settlement layers." + ) + }, + { + "category": "Stablecoin Infrastructure", + "description": ( + "Stablecoin Infrastructure includes minting contracts, collateralization engines, algorithmic stabilization mechanisms, " + "and off-chain attestation integrations. It also encompasses tools for analyzing backing ratios and peg health." + ) + }, + { + "category": "Oracles & Price Feeds", + "description": ( + "Oracles & Price Feeds provide real-world and cross-chain data into smart contracts. This category covers push-based oracles, " + "pull-based on-demand queries, cryptoeconomic staking oracles, and off-chain data relayers." + ) + }, + { + "category": "Vaults, Yield Strategies & Aggregators", + "description": ( + "These tools optimize capital across yield-bearing protocols. 
They include yield routers, auto-compounding vaults, and rebalancers, " + "as well as SDKs to model risk-return profiles and dynamically allocate capital across farms and lending markets." + ) + }, + { + "category": "Asset Management & Portfolio Tooling", + "description": ( + "Asset Management tooling includes interfaces and libraries for building rebalancing strategies, vault-based funds, on-chain ETFs, " + "and automated index trackers. They often incorporate fee structures, role-based access, and compliance checks." + ) + }, + { + "category": "DeFi Security & Monitoring", + "description": ( + "Security tools for DeFi include real-time exploit detectors, anomaly detection systems, pause mechanisms, multisig enforcers, " + "and post-mortem forensic tools. Monitoring dashboards and alerting frameworks fall here as well." + ) + }, + { + "category": "Governance & DAO Tooling", + "description": ( + "Governance & DAO Tooling enables on-chain proposal management, token-weighted voting, off-chain signaling, execution queues, " + "and guardrails for DeFi governance systems. Includes snapshot integration, timelocks, and delegate management interfaces." + ) + }, + { + "category": "Liquidity Bootstrapping & Token Distribution", + "description": ( + "This includes tools for liquidity mining, airdrops, vesting contracts, bonding curves, and initial token offerings. " + "They facilitate community-led distribution, price discovery, and progressive decentralization of DeFi protocols." + ) + }, + { + "category": "DeFi Analytics & Dashboards", + "description": ( + "These are SDKs, APIs, and frontends for aggregating on-chain DeFi metrics—TVL, yield, volume, and user activity. " + "Includes data pipelines, Dune-compatible libraries, subgraphs, and event-based ETL infrastructure tailored to DeFi." + ) + }, + { + "category": "Cross-chain DeFi Infrastructure", + "description": ( + "These tools support multi-chain liquidity routing, cross-chain yield farming, state relays, and synthetic asset issuance. " + "They abstract away bridging mechanics, offering seamless user and liquidity migration across ecosystems." + ) + }, + { + "category": "User Interface & Integration SDKs", + "description": ( + "SDKs and frontend libraries for integrating DeFi functionality into wallets, dApps, and aggregators. Includes trade UIs, " + "Zap interfaces, gas estimators, and batch transaction helpers to improve DeFi UX." + ) + }, + { + "category": "Simulation & Risk Modeling", + "description": ( + "Tools that simulate user positions, economic incentives, or protocol upgrades. They model protocol resilience, agent behavior, " + "market shocks, and contagion scenarios, often using agent-based or Monte Carlo methods for risk-aware design." + ) + }, + + # Developer Tool Categories + { + "category": "Language & Compilation Tools", + "description": ( + "Language & Compilation Tools include compilers, interpreters, language servers, " + "and syntax utilities for smart-contract development. They translate high-level " + "source code into EVM bytecode, perform static analysis, and enable features like " + "symbolic execution, forming the foundation for all higher-level tooling." + ) + }, + { + "category": "Core Protocol Interfaces", + "description": ( + "Core Protocol Interfaces are libraries and SDKs that provide reusable building blocks " + "for blockchain developers—smart contract libraries, JSON-RPC clients, transaction builders, " + "wallet and key management, authorization, signature handling, and ABI encoding/decoding. 
" + "They can power the core operations of many dApps and services." + ) + }, + { + "category": "Development Frameworks", + "description": ( + "Development Frameworks are opinionated, end-to-end toolchains that scaffold, build, " + "test, and deploy smart-contract projects. They bundle CLIs, IDE integrations, task " + "runners, local networks, hot-reloading, and plugin ecosystems to enforce conventions " + "and automate workflows from project setup through to frontend integration." + ) + }, + { + "category": "Deployment & Lifecycle Management", + "description": ( + "Deployment & Lifecycle Management tools handle contract deployment, upgrades, and " + "on-chain migrations. They automate predictable CREATE2 strategies, proxy pattern " + "management, cross-network publishes, and governance hooks, while integrating safety " + "checks and test-suite validations to maintain contract integrity." + ) + }, + { + "category": "Testing & Verification Tools", + "description": ( + "Testing & Verification Tools provide frameworks for unit testing, property-based fuzzing, " + "symbolic execution, formal verification, and coverage analysis. They integrate vulnerability " + "scanners, static analyzers, and coverage reporters to identify edge-case failures and ensure " + "on-chain correctness." + ) + }, + { + "category": "Developer Experience Tools", + "description": ( + "Developer Experience Tools are lightweight plugins and utilities that boost productivity " + "and enforce code consistency. This category includes editor extensions, linters, formatters, " + "code generators, documentation generators, and small CLI helpers." + ) + }, + { + "category": "Infrastructure & Node Operations", + "description": ( + "Infrastructure & Node Operations encompass tools for running, coordinating, and scaling " + "blockchain nodes and peer-to-peer networks. They cover RPC providers, telemetry collectors, " + "log aggregators, gossip-based messaging layers, peer discovery and connection management, " + "and automation scripts to ensure reliable network participation." + ) + }, + { + "category": "Data Indexing & Analytics", + "description": ( + "Data Indexing & Analytics tools ingest, process, and visualize on-chain data. They provide " + "GraphQL and REST APIs over processed datasets, real-time event streaming, and libraries or " + "dashboards for analyzing blockchain metrics." + ) + }, + { + "category": "Interoperability & Cross-chain", + "description": ( + "Interoperability & Cross-chain covers bridging frameworks, cross-chain messaging protocols, " + "and Superchain interoperability tooling. These libraries enable seamless asset transfers, " + "state proofs, and communication across multiple networks." + ) + }, + { + "category": "Cryptography & Primitives", + "description": ( + "Cryptography & Primitives includes low-level cryptographic libraries and building blocks—" + "hash functions, signature schemes, Merkle trees, zero-knowledge proof primitives, and " + "encryption utilities—optimized for security and performance." + ) + }, + { + "category": "Application-Specific & Niche Tools", + "description": ( + "Application-Specific & Niche Tools are libraries and SDKs tailored to very narrow use cases " + "(e.g., DeFi adapters, NFT marketplaces, governance dashboards). They serve specific projects " + "but do not have broad applicability or reusability across the ecosystem." 
+ ) + }, + { + "category": "Others", + "description": ( + "Others is a catch-all for repositories with limited usage or insufficient information—" + "empty projects, single-file utilities, or items that cannot be reasonably categorized." + ) + } +] + +# Create a list of category names for easy access +CATEGORY_NAMES = [cat["category"] for cat in CATEGORIES] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py new file mode 100644 index 00000000..209515cf --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/personas.py @@ -0,0 +1,110 @@ +PERSONAS = [ + { + "name": "protocol_architect", + "title": "Protocol & Infrastructure Architect", + "description": ( + "You evaluate projects based on their technical architecture, infrastructure role, " + "and protocol design patterns. You focus on how well the project implements DeFi primitives, " + "contributes to ecosystem stability, and maintains technical dependencies." + ), + "prompt": ( + "As a Protocol & Infrastructure Architect, analyze the project's technical foundations, " + "infrastructure role, and protocol design.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Based on the technical architecture, infrastructure contribution, and protocol design, " + "choose one of the categories below:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of protocol architecture, infrastructure role, technical dependencies, and ecosystem stability"\n' + "}}" + ), + }, + { + "name": "ecosystem_analyst", + "title": "Ecosystem Growth Analyst", + "description": ( + "You assess projects based on their potential to grow the Ethereum DeFi ecosystem, " + "their user adoption metrics, and their contribution to composability and innovation." + ), + "prompt": ( + "As an Ecosystem Growth Analyst, evaluate the project's impact on DeFi ecosystem growth.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select the category that best represents its ecosystem role:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of ecosystem impact, adoption potential, and composability"\n' + "}}" + ), + }, + { + "name": "security_researcher", + "title": "Security & Risk Researcher", + "description": ( + "You focus on security practices, risk management approaches, and the project's " + "contribution to making DeFi safer and more resilient." + ), + "prompt": ( + "As a Security & Risk Researcher, assess the project's security posture and risk management.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best reflects its security and risk management approach:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of security practices, risk management, and safety features"\n' + "}}" + ), + }, + { + "name": "user_experience_advocate", + "title": "User Experience Advocate", + "description": ( + "You evaluate projects based on their user experience, accessibility, and potential " + "to onboard new users to DeFi. 
You focus on usability and integration capabilities." + ), + "prompt": ( + "As a User Experience Advocate, assess the project's usability and accessibility.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Select the category that best represents its user experience focus:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of user experience, accessibility, and onboarding potential"\n' + "}}" + ), + }, + { + "name": "governance_specialist", + "title": "Governance & Decentralization Specialist", + "description": ( + "You analyze projects based on their governance mechanisms, decentralization approach, " + "and contribution to sustainable protocol management." + ), + "prompt": ( + "As a Governance & Decentralization Specialist, evaluate the project's governance model.\n\n" + "Summary: {summary}\n" + "Stars: {star_count} | Forks: {fork_count}\n" + "Created: {created_at} | Updated: {updated_at}\n\n" + "Choose the category that best reflects its governance and decentralization approach:\n" + "{categories}\n\n" + "Respond in JSON:\n" + "{{\n" + ' "assigned_tag": "category name",\n' + ' "reason": "analysis of governance mechanisms, decentralization, and sustainability"\n' + "}}" + ), + } +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py new file mode 100644 index 00000000..cdd27bfb --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/prompts/summary_prompts.py @@ -0,0 +1,32 @@ +SUMMARY_PROMPT = ( + "You are an analyst preparing short, neutral briefs on open-source projects. 
" + "Read the README below and write a **concise, 2- to 3-sentence summary** that:\n" + "• states the project’s core purpose / problem it solves\n" + "• lists its main capabilities or components (1–3 key points only)\n" + "• mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n" + "• notes any strongly signalled context such as supported programming language, network, or runtime\n" + "\n" + "**Style constraints**\n" + "• Use plain, factual language in third person (no hype, no marketing adjectives).\n" + "• **Do not** guess or invent details that are not explicit in the README.\n" + "• **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n" + "• Limit the summary to <100 words; avoid bullet lists or line breaks.\n" + "\n" + "Return your answer as **exactly one valid JSON object** in this form (nothing extra):\n" + "{{\n" + ' \"summary\": \"your summary here\"\n' + "}}\n" + "\n" + "README:\n" + "{readme_md}" +) + +TAGS_PROMPT = ( + "Based on this project summary, generate a list of relevant tags that " + "describe the project's purpose and functionality.\n\n" + "You must respond with a valid JSON object in this exact format:\n" + "{{\n" + ' "tags": ["tag1", "tag2", "tag3"]\n' + "}}\n\n" + "Summary:\n{summary}" +) diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py new file mode 100644 index 00000000..79c7ebb0 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/config/settings.py @@ -0,0 +1,26 @@ +import os +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# API Keys +OSO_API_KEY = os.getenv("OSO_API_KEY") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") + +# Project paths +PROJECT_ROOT = Path(__file__).parent.parent.parent +DATA_DIR = PROJECT_ROOT / "data" +OUTPUT_DIR = PROJECT_ROOT / "output" + +# Create directories if they don't exist +DATA_DIR.mkdir(exist_ok=True) +OUTPUT_DIR.mkdir(exist_ok=True) + +# GitHub API settings +GITHUB_HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} + +# Gemini model settings +GEMINI_MODEL = "gemini-2.0-flash" \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py new file mode 100644 index 00000000..51e9f286 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/__init__.py @@ -0,0 +1,15 @@ +# This file makes the 'pipeline' directory a Python package. 
+ +from .data_manager import DataManager +from .repository_fetcher import RepositoryFetcherStep +from .summary_generator import SummaryGeneratorStep +from .categorizer import CategorizerStep +from .consolidator import ConsolidatorStep + +__all__ = [ + "DataManager", + "RepositoryFetcherStep", + "SummaryGeneratorStep", + "CategorizerStep", + "ConsolidatorStep", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py new file mode 100644 index 00000000..67762e9a --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/categorizer.py @@ -0,0 +1,172 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, ClassificationOutput + +class CategorizerStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, target_persona_name: str = None, new_only: bool = False): + """ + Categorize projects using AI personas. + Uses batch_size_categorization from config. + + Args: + force_refresh: If True, wipe existing categories and regenerate all + target_persona_name: If specified, only process this persona + new_only: If True, only categorize repositories that don't have categories yet + """ + batch_size = self.config_manager.get_batch_size_categorization() + + if force_refresh: + if target_persona_name: + print(f"Force refresh enabled for persona '{target_persona_name}'. Wiping existing category data for this persona.") + self.data_manager.wipe_categories_data(persona_name=target_persona_name) + else: + print("Force refresh enabled for all personas. Wiping all existing category data.") + self.data_manager.wipe_categories_data() + + # Get summaries data + summaries_df = self.data_manager.get_summaries_data() + if summaries_df.empty: + print("No summarized data found to categorize. Skipping.") + return pd.DataFrame() + + if 'summary' not in summaries_df.columns: + print("Error: 'summary' column not found in summarized data. 
Cannot categorize.") + return pd.DataFrame() + if 'repo_artifact_id' not in summaries_df.columns: + print("Error: 'repo_artifact_id' not found in summarized data.") + return pd.DataFrame() + + # Get personas to process + personas_to_process = [] + if target_persona_name: + persona = self.config_manager.get_persona(target_persona_name) + if persona: + personas_to_process = [persona] + else: + print(f"Error: Persona '{target_persona_name}' not found.") + return pd.DataFrame() + else: + personas_to_process = self.config_manager.get_personas() + + if not personas_to_process: + print("No personas found to process.") + return pd.DataFrame() + + # Process each persona + for persona in personas_to_process: + persona_name = persona['name'] + print(f"\nProcessing persona: {persona_name}") + + # Get existing categories for this persona if any + existing_categories_df = pd.DataFrame() + if not force_refresh: + try: + existing_categories_df = self.data_manager.get_categories_data(persona_name) + except FileNotFoundError: + pass # No existing categories for this persona + + # If we have existing categories and not forcing refresh + if not existing_categories_df.empty and not force_refresh: + if new_only: + # Filter out repositories that already have categories + existing_repos = set(existing_categories_df['repo_artifact_id']) + repos_to_process = summaries_df[~summaries_df['repo_artifact_id'].isin(existing_repos)] + if repos_to_process.empty: + print(f"No new repositories found to categorize for persona '{persona_name}'.") + continue + print(f"Found {len(repos_to_process)} new repositories to categorize for persona '{persona_name}'.") + else: + print(f"Categories already exist for persona '{persona_name}' and force_refresh is false. Skipping.") + continue + else: + repos_to_process = summaries_df + + # Process in batches + all_categorized_data = [] + for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc=f"Categorizing ({persona_name})", leave=False): + end_idx = min(start_idx + batch_size, len(repos_to_process)) + batch_df = repos_to_process.iloc[start_idx:end_idx] + + # Prepare list of dicts, each containing summary and metadata for a project + project_data_batch = [] + required_metadata_cols = ['star_count', 'fork_count', 'created_at', 'updated_at'] + for _, row in batch_df.iterrows(): + project_data = { + 'summary': row.get('summary', ''), + 'repo_artifact_id': row.get('repo_artifact_id', 'UNKNOWN_ID') + } + for col in required_metadata_cols: + project_data[col] = row.get(col) # Will be None if missing, pandas NaT for dates + project_data_batch.append(project_data) + + if not project_data_batch or all(not item['summary'] for item in project_data_batch): + print(f"Skipping batch for {persona_name} as all summaries are effectively empty.") + classifications = [ClassificationOutput(assigned_tag="N/A", reason="Empty summary or batch")] * len(project_data_batch) + else: + classifications: List[ClassificationOutput] = self.ai_service.classify_projects_batch_for_persona( + project_data_batch, + persona + ) + + # Create a temporary DataFrame for this batch's results + temp_batch_df = batch_df.copy() + temp_batch_df[f"{persona_name}_tag"] = [c.assigned_tag for c in classifications] + temp_batch_df[f"{persona_name}_reason"] = [c.reason for c in classifications] + all_categorized_data.append(temp_batch_df) + + if not all_categorized_data: + print(f"No categories were generated for persona '{persona_name}'.") + continue + + new_categories_df = pd.concat(all_categorized_data, ignore_index=True) 
+ + # If we have existing categories and not forcing refresh, combine with new ones + if not existing_categories_df.empty and not force_refresh: + final_categories_df = pd.concat([existing_categories_df, new_categories_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_categories_df = final_categories_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new categorization if there was a duplicate + ) + print(f"Combined data now contains {len(final_categories_df)} repositories with categories for persona '{persona_name}'.") + else: + final_categories_df = new_categories_df + + self.data_manager.save_categories_data(final_categories_df, persona_name) + + return pd.DataFrame() # Return empty DataFrame as we've saved the data + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + if dt_manager.get_summaries_data().empty: + print("No summarized data found. Please run SummaryGeneratorStep first or ensure data exists.") + else: + categorizer_step = CategorizerStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + print("\nRunning CategorizerStep...") + # Set force_refresh=True to re-categorize. + # Specify target_persona_name="keyword_spotter" to only run for one. + categorized_data = categorizer_step.run(force_refresh=False, target_persona_name=None) + + if not categorized_data.empty: + print(f"\nCategorized data head:\n{categorized_data.head()}") + print(f"Number of rows in categorized data: {len(categorized_data)}") + print(f"Columns: {categorized_data.columns.tolist()}") + else: + print("No data returned from categorization step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py new file mode 100644 index 00000000..e4fe539f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/consolidator.py @@ -0,0 +1,183 @@ +import pandas as pd +import numpy as np +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +# from ..config.prompts.tag_mappings import TAG_TO_CATEGORY # Removed + +class ConsolidatorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + + def run(self): + """Consolidate and analyze the classification results from all personas.""" + print("\nConsolidating analysis...") + + # Get the merged data from all personas + # DataManager's get_categories_data() without persona_name should provide this. + categorized_df = self.data_manager.get_categories_data() + + if categorized_df.empty: + print("No categorized data found to consolidate. Skipping.") + return pd.DataFrame() + + # Ensure essential columns are present + if 'repo_artifact_id' not in categorized_df.columns and 'project_id' not in categorized_df.columns: + print("Error: 'repo_artifact_id' or 'project_id' not found in categorized data.") + return pd.DataFrame() + + # Use 'project_id' for grouping if available, else 'repo_artifact_id' + # The original code used 'project_id' for project-level aggregation. + # The raw data from OSO has 'project_id'. Summaries and categories should retain it. 
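# Illustrative example (not itself part of the patch): the project-level recommendation
# computed further below is star-weighted. Every repo in a project adds its star_count
# to each category its persona tags name, and the category with the largest accumulated
# weight wins. A hypothetical two-repo project:
#
#   repo A (1200 stars): protocol_architect -> "Development Frameworks",
#                        security_researcher -> "Testing & Verification Tools"
#   repo B (300 stars):  protocol_architect -> "Development Frameworks",
#                        security_researcher -> "Development Frameworks"
#
#   weights: "Development Frameworks"       = 1200 + 300 + 300 = 1800
#            "Testing & Verification Tools" = 1200
#   recommendation -> "Development Frameworks"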
+ + # Identify persona tag columns + personas = self.config_manager.get_personas() + persona_tag_cols = [f"{persona['name']}_tag" for persona in personas if f"{persona['name']}_tag" in categorized_df.columns] + + if not persona_tag_cols: + print("No persona tag columns found in the categorized data. Cannot consolidate.") + return categorized_df # Return as is, or an empty DF + + # Fill NaNs in numeric columns that might be used for weighting (e.g., star_count) + # These columns should ideally come from the raw_repos_data or summaries_data. + # The categorized_df from DataManager should already have these if merged correctly. + numeric_cols_to_fill = ['star_count', 'fork_count', 'num_packages_in_deps_dev'] + for col in numeric_cols_to_fill: + if col in categorized_df.columns: + categorized_df[col] = categorized_df[col].fillna(0) + else: + # If star_count is missing, we can't do weighted summary as originally designed. + # For now, we'll proceed without it if missing. + print(f"Warning: Column '{col}' not found for consolidation. Weighted summary might be affected.") + + # Drop readme_md if it exists, as it's large and not needed for consolidation + if 'readme_md' in categorized_df.columns: + categorized_df = categorized_df.drop(columns=['readme_md']) + + # Group by project_id to consolidate recommendations + # Define grouping keys. project_id is essential. + grouping_keys = ['project_id'] + # Add other descriptive columns that should be unique per project or take the first + if 'display_name' in categorized_df.columns: grouping_keys.append('display_name') + if 'atlas_id' in categorized_df.columns: grouping_keys.append('atlas_id') + + # Ensure grouping keys are valid and exist in the DataFrame + valid_grouping_keys = [key for key in grouping_keys if key in categorized_df.columns] + if 'project_id' not in valid_grouping_keys: + print("Critical error: 'project_id' is missing. 
Cannot perform project-level consolidation.") + # Save the repo-level data with repo-level recommendations if project_id is missing + # This part re-uses the previous logic for repo-level recommendation if grouping fails + repo_recommendations = [] + if not categorized_df.empty and persona_tag_cols: + for index, row in categorized_df.iterrows(): + assignments = [row[col] for col in persona_tag_cols if pd.notna(row[col]) and row[col] not in ["Error", "N/A", "Other"]] + if assignments: + mode_series = pd.Series(assignments).mode() + repo_recommendations.append(mode_series[0] if not mode_series.empty else 'Other') + else: + repo_recommendations.append('Other') + categorized_df['recommendation'] = repo_recommendations + else: + categorized_df['recommendation'] = 'Other' + self.data_manager.save_consolidated_data(categorized_df) + print("Consolidated analysis saved (repo-level due to missing project_id).") + return categorized_df + + print(f"Consolidating at project level using keys: {valid_grouping_keys}") + + def aggregate_project_data(group): + # New logic for star-weighted recommendation + category_star_weights = {} # Stores sum of stars for each category + + for _, repo_row in group.iterrows(): # Iterate over each repo in the project + stars = repo_row.get('star_count', 0) # star_count was already filled with 0 for NaNs + + # Ensure stars is a non-negative number (already handled by fillna(0) but good practice) + if pd.isna(stars) or not isinstance(stars, (int, float)) or stars < 0: + stars = 0 + else: + stars = int(stars) # Ensure it's an integer for summation + + for p_col in persona_tag_cols: # Iterate over each persona's tag column + category = repo_row.get(p_col) + # Check if category is valid + if pd.notna(category) and category not in ["Error", "N/A", "Other"]: + category_star_weights[category] = category_star_weights.get(category, 0) + stars + + if not category_star_weights: + recommendation = 'Other' + else: + # Find the category with the maximum accumulated star weight + # pd.Series(category_star_weights).idxmax() returns the category (index) with the max value + recommendation = pd.Series(category_star_weights).idxmax() + + # Aggregate other fields + agg_data = { + 'recommendation': recommendation, + 'repo_artifact_namespaces': list(group['repo_artifact_namespace'].unique()) if 'repo_artifact_namespace' in group else [], + 'repo_count': group['repo_artifact_id'].nunique() if 'repo_artifact_id' in group else 0, + 'total_stars': group['star_count'].sum() if 'star_count' in group else 0, + 'total_forks': group['fork_count'].sum() if 'fork_count' in group else 0, + # Add summaries of the top N repos or a combined summary if needed + # For now, let's take the summary of the first repo in the group (by original order) + 'sample_summary': group['summary'].iloc[0] if 'summary' in group and not group['summary'].empty else "" + } + # Add persona tags for the project (e.g., mode of each persona's tags for this project) + for p_col in persona_tag_cols: + persona_project_tags = group[p_col].dropna().tolist() + valid_persona_tags = [tag for tag in persona_project_tags if tag not in ["Error", "N/A", "Other"]] + if valid_persona_tags: + agg_data[f"{p_col}_mode"] = pd.Series(valid_persona_tags).mode()[0] if pd.Series(valid_persona_tags).mode().any() else "N/A" + else: + agg_data[f"{p_col}_mode"] = "N/A" + + return pd.Series(agg_data) + + # Group by valid_grouping_keys and apply aggregation + # Use as_index=False if valid_grouping_keys are to be columns, otherwise they become index + 
project_consolidated_df = categorized_df.groupby(valid_grouping_keys, as_index=False).apply(aggregate_project_data) + + # If groupby().apply() changes the structure unexpectedly (e.g. multi-index if as_index=True was used) + # ensure project_consolidated_df is flat. With as_index=False, it should be. + # If aggregate_project_data returns a Series, and groupby has as_index=False, + # the result should be a DataFrame where grouping keys are columns, and new columns from Series. + # If apply returns a DataFrame, it might need reset_index(). + # Let's ensure it's flat: + if not isinstance(project_consolidated_df.index, pd.RangeIndex): + project_consolidated_df = project_consolidated_df.reset_index() + + + final_df = project_consolidated_df + + # Save results + print(f"\nSaving consolidated analysis (project-level)...") + self.data_manager.save_consolidated_data(final_df) + print("Consolidated analysis saved successfully.") + return final_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure categorized data exists (run categorizer.py example first if needed) + # DataManager's get_categories_data() should merge individual persona files. + if dt_manager.get_categories_data().empty: + print("No categorized data found. Please run CategorizerStep first or ensure data exists.") + else: + consolidator_step = ConsolidatorStep( + data_manager=dt_manager, + config_manager=cfg_manager + ) + print("\nRunning ConsolidatorStep...") + consolidated_df = consolidator_step.run() + + if not consolidated_df.empty: + print(f"\nConsolidated data head:\n{consolidated_df.head()}") + print(f"Number of rows in consolidated data: {len(consolidated_df)}") + print(f"Consolidated columns: {consolidated_df.columns.tolist()}") + print(f"\nRecommendations sample:\n{consolidated_df[['project_id', 'display_name', 'recommendation']].head() if 'project_id' in consolidated_df.columns and 'display_name' in consolidated_df.columns else consolidated_df['recommendation'].head()}") + + else: + print("No data returned from consolidation step.") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py new file mode 100644 index 00000000..035e1caf --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/data_manager.py @@ -0,0 +1,391 @@ +import pandas as pd +import json +from pathlib import Path +import shutil +from typing import List, Dict, Any +from ..config.settings import PROJECT_ROOT + +class DataManager: + def __init__(self, output_dir: Path, config=None): + self.output_dir = output_dir + self.config = config # For future use, e.g., different storage backends + + # Legacy paths + self.raw_parquet_path = self.output_dir / "devtooling_raw.parquet" + self.summarized_parquet_path = self.output_dir / "devtooling_summarized.parquet" + self.categorized_dir = self.output_dir / "categorized" + self.final_parquet_path = self.output_dir / "devtooling_full.parquet" + self.consolidated_csv_path = self.output_dir / "devtooling_consolidated.csv" + + # New unified data paths - ensure they're in the current repo's output directory + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + self.unified_parquet_path = local_output_dir / "ethereum_repos_unified.parquet" + self.unified_csv_path = 
local_output_dir / "ethereum_repos_unified.csv" + + # Create directories if they don't exist + self.output_dir.mkdir(parents=True, exist_ok=True) + self.categorized_dir.mkdir(parents=True, exist_ok=True) + + def get_repos_data(self) -> pd.DataFrame: + """Get the latest repository data""" + if self.raw_parquet_path.exists(): + return pd.read_parquet(self.raw_parquet_path) + return pd.DataFrame() + + def get_summaries_data(self) -> pd.DataFrame: + """Get the latest summaries data""" + if self.summarized_parquet_path.exists(): + return pd.read_parquet(self.summarized_parquet_path) + return pd.DataFrame() + + def get_categories_data(self, persona_name: str = None) -> pd.DataFrame: + """Get the latest categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / f"{persona_name}.parquet" + if persona_file.exists(): + return pd.read_parquet(persona_file) + return pd.DataFrame() + else: + # Combine all persona files + all_persona_dfs = [] + for persona_file in self.categorized_dir.glob("*.parquet"): + df = pd.read_parquet(persona_file) + all_persona_dfs.append(df) + + if not all_persona_dfs: + return pd.DataFrame() + + # Concatenate all dataframes. If a project appears in multiple files, + # the last one read will take precedence for shared columns (like 'summary'). + # Persona-specific columns (e.g., 'persona_X_tag') will be unique. + # We need a more robust way to merge these if there are overlapping non-persona columns. + # For now, assuming 'project_id' or 'repo_artifact_id' is the key. + + # A simple concat might lead to duplicate columns if not handled carefully. + # Let's assume each persona file has unique columns for its tags/reasons. + # And common columns like 'project_id', 'summary' are present. + + # Start with the summaries data as the base + base_df = self.get_summaries_data() + if base_df.empty: + # If no summaries, try to load from the first persona file as a base + if all_persona_dfs: + base_df = all_persona_dfs[0][['project_id', 'repo_artifact_id', 'summary']].copy() # Adjust columns as needed + else: + return pd.DataFrame() + + + # Set index for joining + if 'repo_artifact_id' in base_df.columns: + base_df = base_df.set_index('repo_artifact_id') + elif 'project_id' in base_df.columns: + base_df = base_df.set_index('project_id') + else: + # Fallback if no clear index, this might lead to issues + print("Warning: No clear index (project_id or repo_artifact_id) for merging category data.") + + + for df_persona in all_persona_dfs: + # Identify the persona name from its columns (e.g., "keyword_spotter_tag") + current_persona_name = None + for col_name in df_persona.columns: + if col_name.endswith("_tag"): + current_persona_name = col_name.replace("_tag", "") + break + + if not current_persona_name: + print(f"Warning: Could not determine persona name from columns in a categorized file. Skipping this file.") + continue + + # Columns to join are just the tag and reason for this specific persona + persona_tag_col = f"{current_persona_name}_tag" + persona_reason_col = f"{current_persona_name}_reason" + + cols_from_persona_df = [] + if persona_tag_col in df_persona.columns: + cols_from_persona_df.append(persona_tag_col) + if persona_reason_col in df_persona.columns: + cols_from_persona_df.append(persona_reason_col) + + if not cols_from_persona_df: + print(f"Warning: No tag/reason columns found for persona {current_persona_name} in its file. 
Skipping join for this persona.") + continue + + # Set index for df_persona before selecting columns for join + if base_df.index.name in df_persona.columns: # base_df.index.name is 'repo_artifact_id' or 'project_id' + df_persona_indexed = df_persona.set_index(base_df.index.name) + else: + print(f"Warning: Index column '{base_df.index.name}' not found in persona DataFrame for {current_persona_name}. Attempting join without re-indexing persona df, might be incorrect.") + df_persona_indexed = df_persona # This might lead to issues if not indexed properly + + # Ensure only existing columns are selected from df_persona_indexed + valid_cols_to_join = [col for col in cols_from_persona_df if col in df_persona_indexed.columns] + + if not valid_cols_to_join: + print(f"Warning: Persona specific columns {cols_from_persona_df} not found as actual columns in indexed persona dataframe for {current_persona_name}. Skipping join for this persona.") + continue + + base_df = base_df.join(df_persona_indexed[valid_cols_to_join], how='left', rsuffix=f'_{current_persona_name}_dup') + + # Clean up duplicate columns if any (this is a basic cleanup for rsuffix) + cols_to_drop = [col for col in base_df.columns if '_dup' in col] + base_df.drop(columns=cols_to_drop, inplace=True, errors='ignore') + + return base_df.reset_index() + + + def save_repos_data(self, data: pd.DataFrame): + """Save repository data""" + data.to_parquet(self.raw_parquet_path, index=False) + print(f"Repository data saved to {self.raw_parquet_path}") + + def save_summaries_data(self, data: pd.DataFrame, append: bool = False): + """Save summaries data. If append is True, appends to existing file if it exists.""" + if append and self.summarized_parquet_path.exists(): + existing_df = pd.read_parquet(self.summarized_parquet_path) + # Ensure no duplicate columns before concat, especially if 'summary' is regenerated + # A more robust merge/update might be needed depending on exact requirements + data_to_save = pd.concat([existing_df, data]).drop_duplicates(subset=['repo_artifact_id'], keep='last') # Assuming repo_artifact_id is unique key + else: + data_to_save = data + data_to_save.to_parquet(self.summarized_parquet_path, index=False) + print(f"Summaries data saved to {self.summarized_parquet_path}") + + def save_categories_data(self, data: pd.DataFrame, persona_name: str): + """Save categories data for a specific persona""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + data.to_parquet(persona_file, index=False) + print(f"Categories data for persona {persona_name} saved to {persona_file}") + + def save_consolidated_data(self, data: pd.DataFrame): + """Save consolidated data to Parquet and CSV""" + data.to_parquet(self.final_parquet_path, index=False) + print(f"Consolidated Parquet data saved to {self.final_parquet_path}") + data.to_csv(self.consolidated_csv_path, index=False) + print(f"Consolidated CSV data saved to {self.consolidated_csv_path}") + + def wipe_repos_data(self): + """Wipe repository data""" + if self.raw_parquet_path.exists(): + self.raw_parquet_path.unlink() + print(f"Wiped repository data: {self.raw_parquet_path}") + + def wipe_summaries_data(self): + """Wipe summaries data""" + if self.summarized_parquet_path.exists(): + self.summarized_parquet_path.unlink() + print(f"Wiped summaries data: {self.summarized_parquet_path}") + + def wipe_categories_data(self, persona_name: str = None): + """Wipe categories data, optionally for a specific persona or all.""" + if persona_name: + persona_file = self.categorized_dir / 
f"{persona_name}.parquet" + if persona_file.exists(): + persona_file.unlink() + print(f"Wiped categories data for persona {persona_name}: {persona_file}") + else: + if self.categorized_dir.exists(): + shutil.rmtree(self.categorized_dir) + self.categorized_dir.mkdir(parents=True, exist_ok=True) # Recreate after wiping + print(f"Wiped all categories data in {self.categorized_dir}") + + def has_categories_for_persona(self, persona_name: str) -> bool: + """Check if category data exists for a specific persona.""" + persona_file = self.categorized_dir / f"{persona_name}.parquet" + return persona_file.exists() + + def get_final_parquet_path(self) -> Path: + return self.final_parquet_path + + def get_consolidated_csv_path(self) -> Path: + return self.consolidated_csv_path + + # New methods for unified data structure + + def save_unified_data(self, data: pd.DataFrame): + """ + Save unified repository data to Parquet and CSV. + This data includes all repositories, summaries, and categorizations in a single structure. + """ + # Ensure categorizations column is properly serialized for Parquet + if 'categorizations' in data.columns: + # Convert categorizations to strings for storage + # This is necessary because Parquet doesn't handle complex nested structures well + data_copy = data.copy() + data_copy['categorizations_json'] = data_copy['categorizations'].apply( + lambda x: json.dumps(x) if isinstance(x, list) else '[]' + ) + + # Save to Parquet (without the original categorizations column) + parquet_data = data_copy.drop(columns=['categorizations']) + parquet_data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Save to CSV for easier viewing (also without the complex column) + csv_data = parquet_data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + # Truncate other potentially long text fields + for col in ['categorizations_json']: + if col in csv_data.columns: + csv_data[col] = csv_data[col].apply( + lambda x: (x[:50] + '...') if isinstance(x, str) and len(x) > 50 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + else: + # If no categorizations column, save as is + data.to_parquet(self.unified_parquet_path, index=False) + print(f"Unified data saved to {self.unified_parquet_path}") + + # Create a readable CSV version + csv_data = data.copy() + + # Remove README text and truncate long text fields for CSV readability + if 'readme_md' in csv_data.columns: + csv_data = csv_data.drop(columns=['readme_md']) + + if 'summary' in csv_data.columns: + csv_data['summary'] = csv_data['summary'].apply( + lambda x: (x[:100] + '...') if isinstance(x, str) and len(x) > 100 else x + ) + + csv_data.to_csv(self.unified_csv_path, index=False) + print(f"Unified CSV data saved to {self.unified_csv_path} (README text removed)") + + def get_unified_data(self) -> pd.DataFrame: + """ + Get the unified repository data with properly deserialized categorizations. 
+ """ + if not self.unified_parquet_path.exists(): + return pd.DataFrame() + + # Load the data from Parquet + data = pd.read_parquet(self.unified_parquet_path) + + # Deserialize the categorizations from JSON if present + if 'categorizations_json' in data.columns: + data['categorizations'] = data['categorizations_json'].apply( + lambda x: json.loads(x) if isinstance(x, str) else [] + ) + data = data.drop(columns=['categorizations_json']) + + return data + + def append_unified_data(self, new_repo_data: pd.DataFrame) -> None: + """ + Append a single repository or multiple repositories to the existing unified data. + + Args: + new_repo_data: DataFrame containing the new repository data to append + """ + if new_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the new data + self.save_unified_data(new_repo_data) + return + + # Combine existing and new data + combined_data = pd.concat([existing_data, new_repo_data], ignore_index=True) + + # Remove duplicates based on repo_artifact_id, keeping the newest version + combined_data = combined_data.sort_values('processing_timestamp', ascending=False) + combined_data = combined_data.drop_duplicates(subset=['repo_artifact_id'], keep='first') + + # Save the combined data + self.save_unified_data(combined_data) + + def update_unified_data(self, updated_repo_data: pd.DataFrame) -> None: + """ + Update specific repositories in the existing unified data. + + Args: + updated_repo_data: DataFrame containing the updated repository data + """ + if updated_repo_data.empty: + return + + existing_data = self.get_unified_data() + + if existing_data.empty: + # If no existing data, just save the updated data + self.save_unified_data(updated_repo_data) + return + + # Get the repo_artifact_ids of the updated repositories + updated_ids = set(updated_repo_data['repo_artifact_id']) + + # Remove the repositories that are being updated from the existing data + filtered_existing = existing_data[~existing_data['repo_artifact_id'].isin(updated_ids)] + + # Combine the filtered existing data with the updated data + combined_data = pd.concat([filtered_existing, updated_repo_data], ignore_index=True) + + # Save the combined data + self.save_unified_data(combined_data) + + def wipe_unified_data(self): + """Wipe unified data files""" + if self.unified_parquet_path.exists(): + self.unified_parquet_path.unlink() + print(f"Wiped unified data: {self.unified_parquet_path}") + if self.unified_csv_path.exists(): + self.unified_csv_path.unlink() + print(f"Wiped unified CSV data: {self.unified_csv_path}") + + def get_checkpoint_path(self) -> Path: + """Get the path to the processing checkpoint file""" + local_output_dir = Path(PROJECT_ROOT) / "output" + local_output_dir.mkdir(parents=True, exist_ok=True) + return local_output_dir / "processing_checkpoint.json" + + def save_checkpoint(self, checkpoint_data: Dict[str, Any]) -> None: + """ + Save the processing checkpoint data to a JSON file. + + Args: + checkpoint_data: Dictionary containing checkpoint information + """ + checkpoint_path = self.get_checkpoint_path() + with open(checkpoint_path, 'w') as f: + json.dump(checkpoint_data, f, indent=2) + + def load_checkpoint(self) -> Dict[str, Any]: + """ + Load the processing checkpoint data from a JSON file. 
+ + Returns: + Dictionary containing checkpoint information, or empty dict if no checkpoint exists + """ + checkpoint_path = self.get_checkpoint_path() + if not checkpoint_path.exists(): + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + + try: + with open(checkpoint_path, 'r') as f: + return json.load(f) + except Exception as e: + print(f"Error loading checkpoint: {e}") + return { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py new file mode 100644 index 00000000..fc5f398f --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/repository_fetcher.py @@ -0,0 +1,125 @@ +import pandas as pd +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.fetcher import DataFetcher + +class RepositoryFetcherStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager): + self.data_manager = data_manager + self.config_manager = config_manager + self.fetcher = DataFetcher() # Assuming DataFetcher doesn't need config for initialization + + def run(self, force_refresh: bool = False, fetch_new_only: bool = False): + """ + Fetch repositories and READMEs. + Uses test_mode and test_mode_limit from config if test_mode is enabled. + + Args: + force_refresh: If True, wipe existing data and fetch everything fresh + fetch_new_only: If True, only fetch repositories that don't exist in current data + """ + limit = None + sort_by_stars_in_test = False + is_test = self.config_manager.is_test_mode() + + if is_test: + limit = self.config_manager.get_test_mode_limit() + sort_by_stars_in_test = True # Always sort by stars in test mode as per new req + print(f"Running in TEST MODE: Targeting up to {limit} repositories, sorted by stars DESC.") + + if force_refresh: + print("Force refresh enabled for repository data. Wiping existing raw data.") + self.data_manager.wipe_repos_data() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_repos_data() + if not existing_df.empty: + if fetch_new_only: + print("Fetching only new repositories while keeping existing ones...") + else: + print("Repository data already exists and force_refresh is false.") + if is_test: + if 'star_count' in existing_df.columns: + print(f"Applying test mode (sort by stars, limit {limit}) to existing data.") + sorted_df = existing_df.sort_values(by='star_count', ascending=False) + return sorted_df.head(limit) + else: + print(f"Warning: 'star_count' not in existing data. Using first {limit} entries for test mode.") + return existing_df.head(limit) + return existing_df # Not test mode, return all existing + + # If here, either force_refresh is true or data doesn't exist. 
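# Descriptive note (not itself part of the patch): fetch_repositories() is assumed to
# return one row per repository with at least 'repo_artifact_namespace',
# 'repo_artifact_name' and 'star_count'; get_all_readmes() then adds a 'readme_md'
# column (empty string when no README could be retrieved). The code below keys on these
# columns, e.g. deduplicating on the (namespace, name) pair and sorting by star_count
# in test mode.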
+ print("Fetching repositories from OSO...") + # Pass sort_by_stars only if in test_mode, limit is passed anyway (None if not test) + new_repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=sort_by_stars_in_test) + + if new_repos_df.empty: + print("No repositories found from OSO fetch.") + # Save an empty DataFrame to indicate the step ran + self.data_manager.save_repos_data(pd.DataFrame()) + return pd.DataFrame() + + print(f"Found {len(new_repos_df)} repositories from OSO.") + + if fetch_new_only and not existing_df.empty: + # Filter out repositories that already exist + existing_repos = set(zip(existing_df['repo_artifact_namespace'], existing_df['repo_artifact_name'])) + new_repos_df = new_repos_df[~new_repos_df.apply( + lambda x: (x['repo_artifact_namespace'], x['repo_artifact_name']) in existing_repos, + axis=1 + )] + print(f"Found {len(new_repos_df)} new repositories to process.") + + if new_repos_df.empty: + print("No new repositories to process.") + return existing_df + + print("Fetching READMEs from GitHub...") + # Ensure 'repo_artifact_namespace' and 'repo_artifact_name' exist + if 'repo_artifact_namespace' not in new_repos_df.columns or 'repo_artifact_name' not in new_repos_df.columns: + print("Error: 'repo_artifact_namespace' or 'repo_artifact_name' not in fetched data.") + # Save what we have so far + self.data_manager.save_repos_data(new_repos_df) + return new_repos_df # Or handle error more gracefully + + new_repos_df = self.fetcher.get_all_readmes(new_repos_df) + print(f"Retrieved READMEs for {len(new_repos_df[new_repos_df['readme_md'] != ''])} repositories.") + + # Combine existing and new data + if not existing_df.empty: + combined_df = pd.concat([existing_df, new_repos_df], ignore_index=True) + # Remove any duplicates that might have been introduced + combined_df = combined_df.drop_duplicates( + subset=['repo_artifact_namespace', 'repo_artifact_name'], + keep='first' + ) + print(f"Combined data now contains {len(combined_df)} repositories.") + self.data_manager.save_repos_data(combined_df) + + # If in test mode and combined data exceeds limit + if limit is not None and len(combined_df) > limit: + if 'star_count' in combined_df.columns: + return combined_df.sort_values(by='star_count', ascending=False).head(limit) + return combined_df.head(limit) + return combined_df + else: + self.data_manager.save_repos_data(new_repos_df) + # If in test mode and fetched more than limit + if limit is not None and len(new_repos_df) > limit: + return new_repos_df.head(limit) + return new_repos_df + +if __name__ == '__main__': + # Example Usage (requires .env file and OSO/GitHub credentials) + # Ensure pipeline_config.json exists or is created with defaults + cfg_manager = ConfigManager() + + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + repo_fetch_step = RepositoryFetcherStep(data_manager=dt_manager, config_manager=cfg_manager) + + print("\nRunning RepositoryFetcherStep...") + fetched_data = repo_fetch_step.run(force_refresh=False) # Set True to wipe and refetch + print(f"\nFetched data head:\n{fetched_data.head()}") + print(f"Number of rows fetched: {len(fetched_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py new file mode 100644 index 00000000..f09b0d78 --- /dev/null +++ 
b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/summary_generator.py @@ -0,0 +1,131 @@ +import pandas as pd +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput + +class SummaryGeneratorStep: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + + def run(self, force_refresh: bool = False, new_only: bool = False): + """ + Generate summaries for repositories. + Uses batch_size_summaries from config. + + Args: + force_refresh: If True, wipe existing summaries and regenerate all + new_only: If True, only generate summaries for repositories that don't have them yet + """ + batch_size = self.config_manager.get_batch_size_summaries() + + if force_refresh: + print("Force refresh enabled for summaries. Wiping existing summarized data.") + self.data_manager.wipe_summaries_data() + existing_summaries_df = pd.DataFrame() + else: + existing_summaries_df = self.data_manager.get_summaries_data() + + # Get repository data + repos_df = self.data_manager.get_repos_data() + if repos_df.empty: + print("No repository data found to generate summaries. Skipping.") + # Save an empty DataFrame to indicate the step ran if forced + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + # If we have existing summaries and not forcing refresh + if not existing_summaries_df.empty and not force_refresh: + if new_only: + # Filter out repositories that already have summaries + existing_repos = set(existing_summaries_df['repo_artifact_id']) + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(existing_repos)] + if repos_to_process.empty: + print("No new repositories found to generate summaries for.") + return existing_summaries_df + print(f"Found {len(repos_to_process)} new repositories to generate summaries for.") + else: + print("Summarized data already exists and force_refresh is false. Skipping summary generation.") + return existing_summaries_df + else: + repos_to_process = repos_df + + # Ensure 'readme_md' and 'repo_artifact_id' columns exist + if 'readme_md' not in repos_to_process.columns: + print("Error: 'readme_md' column not found in repository data. Cannot generate summaries.") + return pd.DataFrame() + if 'repo_artifact_id' not in repos_to_process.columns: + print("Error: 'repo_artifact_id' column not found. 
This ID is crucial.") + return pd.DataFrame() + + print(f"Generating summaries for {len(repos_to_process)} repositories in batches of {batch_size}...") + + all_summaries_data = [] # To collect all rows with new summaries + + # Process in batches + for start_idx in tqdm(range(0, len(repos_to_process), batch_size), desc="Generating Summaries"): + end_idx = min(start_idx + batch_size, len(repos_to_process)) + batch_df_initial = repos_to_process.iloc[start_idx:end_idx] + + # Create a working copy for this batch to add summaries + batch_df_processed = batch_df_initial.copy() + + summaries = [] + for _, row in batch_df_initial.iterrows(): + readme_content = row.get('readme_md', "") + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + summaries.append(summary_output.summary) + + batch_df_processed["summary"] = summaries + all_summaries_data.append(batch_df_processed) + + if not all_summaries_data: + print("No summaries were generated.") + # Save an empty DataFrame if no summaries were made but the step was intended to run + if force_refresh or not self.data_manager.summarized_parquet_path.exists(): + self.data_manager.save_summaries_data(pd.DataFrame()) + return pd.DataFrame() + + new_summaries_df = pd.concat(all_summaries_data, ignore_index=True) + + # If we have existing summaries and not forcing refresh, combine with new ones + if not existing_summaries_df.empty and not force_refresh: + final_summarized_df = pd.concat([existing_summaries_df, new_summaries_df], ignore_index=True) + # Remove any duplicates that might have been introduced + final_summarized_df = final_summarized_df.drop_duplicates( + subset=['repo_artifact_id'], + keep='last' # Keep the new summary if there was a duplicate + ) + print(f"Combined data now contains {len(final_summarized_df)} repositories with summaries.") + else: + final_summarized_df = new_summaries_df + + self.data_manager.save_summaries_data(final_summarized_df) + + return final_summarized_df + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + # Ensure repo data exists (run repo_fetcher.py example first if needed) + if dt_manager.get_repos_data().empty: + print("No repository data found. 
Please run RepositoryFetcherStep first or ensure data exists.") + else: + summary_gen_step = SummaryGeneratorStep( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning SummaryGeneratorStep...") + # Set force_refresh=True to regenerate even if file exists + summarized_data = summary_gen_step.run(force_refresh=False) + print(f"\nSummarized data head:\n{summarized_data.head()}") + print(f"Number of rows with summaries: {len(summarized_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py new file mode 100644 index 00000000..d669fa76 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/pipeline/unified_processor.py @@ -0,0 +1,432 @@ +import pandas as pd +import datetime +import json +import time +from typing import List, Dict, Any, Optional, Set +from tqdm import tqdm +from .data_manager import DataManager +from ..config.config_manager import ConfigManager +from ..processing.ai_service import AIService, SummaryOutput, ClassificationOutput +from ..processing.fetcher import DataFetcher + +class UnifiedProcessor: + def __init__(self, data_manager: DataManager, config_manager: ConfigManager, ai_service: AIService): + self.data_manager = data_manager + self.config_manager = config_manager + self.ai_service = ai_service + self.fetcher = DataFetcher() + + def run(self, + force_refresh: bool = False, + include_forks: bool = False, + inactive_repos: bool = False, + limit: Optional[int] = None): + """ + Unified processing pipeline that fetches repositories, READMEs, generates summaries, + and categorizes them in a single pass. + + Args: + force_refresh: If True, wipe existing data and process everything fresh + include_forks: If True, include forked repositories in processing + inactive_repos: If True, include repositories not updated in the last year + limit: Optional limit on number of repositories to process + """ + # Get test mode settings + is_test = self.config_manager.is_test_mode() + if is_test: + test_limit = self.config_manager.get_test_mode_limit() + if limit is None or limit > test_limit: + limit = test_limit + print(f"Running in TEST MODE: Limiting to {limit} repositories, sorted by stars DESC.") + + # Determine batch sizes + batch_size = min( + self.config_manager.get_batch_size_summaries(), + self.config_manager.get_batch_size_categorization() + ) + + # Load checkpoint or initialize a new one + if force_refresh: + print("Force refresh enabled. Wiping existing data and checkpoint.") + self.data_manager.wipe_unified_data() + self._initialize_checkpoint() + existing_df = pd.DataFrame() + else: + existing_df = self.data_manager.get_unified_data() + if not existing_df.empty: + print(f"Found existing data with {len(existing_df)} repositories.") + + # Fetch repositories from OSO + print("Fetching repositories from OSO...") + repos_df = self.fetcher.fetch_repositories(limit=limit, sort_by_stars=True) + + if repos_df.empty: + print("No repositories found from OSO fetch.") + return pd.DataFrame() + + print(f"Found {len(repos_df)} repositories from OSO.") + + # Filter repositories based on parameters + if not include_forks: + repos_df = repos_df[~repos_df['is_fork']] + print(f"Filtered out forks. {len(repos_df)} repositories remaining.") + + if not inactive_repos: + repos_df = repos_df[repos_df['is_actively_maintained']] + print(f"Filtered out inactive repositories. 
{len(repos_df)} repositories remaining.") + + # Load checkpoint to determine which repositories need processing + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + + # Determine which repositories need processing + if not force_refresh: + # Filter out already processed repositories + repos_to_process = repos_df[~repos_df['repo_artifact_id'].isin(processed_repos)] + print(f"Found {len(repos_to_process)} repositories that need processing.") + + # Process the repositories + processed_df = self._process_repositories(repos_to_process, batch_size) + + # Return the combined data (existing + newly processed) + return self.data_manager.get_unified_data() + else: + # Process all repositories + processed_df = self._process_repositories(repos_df, batch_size) + return self.data_manager.get_unified_data() + + def _initialize_checkpoint(self): + """Initialize a new checkpoint file""" + checkpoint = { + "last_processed_repo_id": None, + "processed_repos": [], + "partial_results": {} + } + self.data_manager.save_checkpoint(checkpoint) + print("Initialized new processing checkpoint.") + + def _process_repositories(self, repos_df: pd.DataFrame, batch_size: int) -> pd.DataFrame: + """ + Process repositories in batches: fetch READMEs, generate summaries, and categorize. + + Args: + repos_df: DataFrame containing repositories to process + batch_size: Number of repositories to process in each batch + + Returns: + DataFrame with processed repositories + """ + print(f"Processing {len(repos_df)} repositories in batches of {batch_size}...") + + # Get personas for categorization + personas = self.config_manager.get_personas() + if not personas: + print("No personas found for categorization.") + return repos_df + + # Load checkpoint + checkpoint = self.data_manager.load_checkpoint() + processed_repos = set(checkpoint.get("processed_repos", [])) + partial_results = checkpoint.get("partial_results", {}) + + # Process in batches + all_processed_data = [] + + for start_idx in tqdm(range(0, len(repos_df), batch_size), desc="Processing Repositories"): + end_idx = min(start_idx + batch_size, len(repos_df)) + batch_df = repos_df.iloc[start_idx:end_idx].copy() + + # Process each repository in the batch + for idx, row in tqdm(batch_df.iterrows(), desc="Processing repositories in batch", total=len(batch_df), leave=False): + repo_id = row.get('repo_artifact_id') + repo_name = row.get('repo_artifact_name', 'repo') + + # Skip if already fully processed + if repo_id in processed_repos: + print(f"Skipping {repo_name} (already processed)") + continue + + # Get partial progress for this repository + partial = partial_results.get(repo_id, {}) + + # Initialize repository data + repo_data = row.to_dict() + repo_data['categorizations'] = [] + repo_data['final_recommendation'] = 'UNCATEGORIZED' + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + repo_data['summary'] = '' + + # Fetch README if needed + if not partial.get('readme_fetched', False): + try: + print(f"Fetching README for {repo_name}...") + readme_content, readme_status = self.fetcher.fetch_readme( + repo_data['repo_artifact_namespace'], + repo_data['repo_artifact_name'] + ) + repo_data['readme_md'] = readme_content + repo_data['readme_status'] = readme_status + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = repo_data['readme_status'] + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + 
self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error fetching README for {repo_name}: {e}") + repo_data['readme_md'] = '' + repo_data['readme_status'] = 'ERROR' + + # Update checkpoint + partial['readme_fetched'] = True + partial['readme_status'] = 'ERROR' + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + else: + # Use README status from checkpoint + repo_data['readme_status'] = partial.get('readme_status', 'ERROR') + + # Generate summary if needed + if not partial.get('summary_generated', False) and repo_data['readme_status'] == 'SUCCESS': + try: + print(f"Generating summary for {repo_name}...") + readme_content = repo_data.get('readme_md', '') + summary_output: SummaryOutput = self.ai_service.make_summary(readme_content) + repo_data['summary'] = summary_output.summary + + # Update checkpoint + partial['summary_generated'] = True + partial['summary'] = summary_output.summary + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error generating summary for {repo_name}: {e}") + repo_data['summary'] = '' + + # Update checkpoint + partial['summary_generated'] = True # Mark as attempted + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + elif partial.get('summary_generated', False) and 'summary' in partial: + # Use summary from checkpoint + repo_data['summary'] = partial.get('summary', '') + + # Initialize personas completed + if 'personas_completed' not in partial: + partial['personas_completed'] = [] + + # Initialize categorizations + categorizations = [] + + # Categorize with each persona if README is available + if repo_data['readme_status'] == 'SUCCESS': + for persona in tqdm(personas, desc=f"Categorizing {repo_name} with personas", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + + try: + # Prepare project data for categorization + project_data = { + 'summary': repo_data['summary'], + 'repo_artifact_id': repo_id, + 'star_count': repo_data.get('star_count', 0), + 'fork_count': repo_data.get('fork_count', 0), + 'created_at': repo_data.get('created_at'), + 'updated_at': repo_data.get('updated_at') + } + + # Get categorization from this persona + classifications = self.ai_service.classify_projects_batch_for_persona( + [project_data], + persona + ) + + if classifications and len(classifications) > 0: + classification = classifications[0] + cat_entry = { + 'persona_name': persona['name'], + 'category': classification.assigned_tag, + 'reason': classification.reason, + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + else: + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': 
'Failed to get classification from AI service', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + except Exception as e: + print(f"Error categorizing {repo_name} with persona {persona['name']}: {e}") + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'Error: {str(e)}', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add a small delay to avoid rate limiting + time.sleep(0.1) + else: + # If README is empty or error, mark all categorizations as UNCATEGORIZED + for persona in tqdm(personas, desc=f"Marking {repo_name} as UNCATEGORIZED", leave=False): + # Skip if already categorized by this persona + if persona['name'] in partial.get('personas_completed', []): + # Use existing categorization from checkpoint + if 'categorizations' in partial: + for cat in partial['categorizations']: + if cat['persona_name'] == persona['name']: + categorizations.append(cat) + break + continue + + cat_entry = { + 'persona_name': persona['name'], + 'category': 'UNCATEGORIZED', + 'reason': f'README {repo_data["readme_status"]}', + 'timestamp': datetime.datetime.now().isoformat() + } + categorizations.append(cat_entry) + + # Update checkpoint + if 'categorizations' not in partial: + partial['categorizations'] = [] + partial['categorizations'].append(cat_entry) + partial['personas_completed'].append(persona['name']) + partial_results[repo_id] = partial + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Determine final recommendation based on categorizations + final_recommendation = self._determine_final_recommendation(categorizations, repo_data.get('star_count', 0)) + + # Update the repository data + repo_data['categorizations'] = categorizations + repo_data['final_recommendation'] = final_recommendation + repo_data['processing_timestamp'] = datetime.datetime.now().isoformat() + + # Create a DataFrame for this repository + repo_df = pd.DataFrame([repo_data]) + + # Save this repository to the unified data + self.data_manager.append_unified_data(repo_df) + + # Mark as fully processed + processed_repos.add(repo_id) + checkpoint['processed_repos'] = list(processed_repos) + checkpoint['last_processed_repo_id'] = repo_id + + # Remove from partial results to save space + if repo_id in partial_results: + del partial_results[repo_id] + + checkpoint['partial_results'] = partial_results + self.data_manager.save_checkpoint(checkpoint) + + # Add to processed data + all_processed_data.append(repo_df) + + if not all_processed_data: + print("No data was processed.") + return pd.DataFrame() + + return pd.concat(all_processed_data, ignore_index=True) if all_processed_data else pd.DataFrame() + + def _determine_final_recommendation(self, categorizations: List[Dict[str, Any]], star_count: int) -> 
str: + """ + Determine the final recommendation based on categorizations from all personas. + + Args: + categorizations: List of categorization dictionaries + star_count: Star count of the repository (for potential future weighting) + + Returns: + Final category recommendation + """ + # Filter out UNCATEGORIZED entries + valid_categories = [c['category'] for c in categorizations if c['category'] != 'UNCATEGORIZED'] + + if not valid_categories: + return 'UNCATEGORIZED' + + # Count occurrences of each category + category_counts = {} + for category in valid_categories: + category_counts[category] = category_counts.get(category, 0) + 1 + + # Find the most common category + max_count = 0 + final_category = 'UNCATEGORIZED' + + for category, count in category_counts.items(): + if count > max_count: + max_count = count + final_category = category + + return final_category + + +if __name__ == '__main__': + # Example Usage + cfg_manager = ConfigManager() + ai_svc = AIService(config_manager=cfg_manager) + output_dir = cfg_manager.get_output_dir() + dt_manager = DataManager(output_dir=output_dir, config=cfg_manager) + + processor = UnifiedProcessor( + data_manager=dt_manager, + config_manager=cfg_manager, + ai_service=ai_svc + ) + + print("\nRunning UnifiedProcessor...") + processed_data = processor.run( + force_refresh=False, + include_forks=False, + inactive_repos=False + ) + + if not processed_data.empty: + print(f"\nProcessed data head:\n{processed_data.head()}") + print(f"Number of rows processed: {len(processed_data)}") diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py new file mode 100644 index 00000000..e1ebbcd8 --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/__init__.py @@ -0,0 +1,10 @@ +from .fetcher import DataFetcher +from .ai_service import AIService + +# The old Summarizer class has been effectively replaced by AIService +# and the pipeline steps (SummaryGeneratorStep, CategorizerStep). 
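+#
+# A minimal usage sketch (an illustration only, assuming a valid GEMINI_API_KEY and
+# the default pipeline_config.json; see the __main__ blocks in the pipeline modules):
+#
+#   from ..config.config_manager import ConfigManager
+#   cfg = ConfigManager()
+#   ai = AIService(config_manager=cfg)
+#   summary_text = ai.make_summary(readme_md).summary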
+ +__all__ = [ + "DataFetcher", + "AIService", +] diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py new file mode 100644 index 00000000..074767dd --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/ai_service.py @@ -0,0 +1,263 @@ +import json +import pandas as pd +import time +from dataclasses import dataclass, asdict +from typing import List, Type, TypeVar, Union, Dict, Any +import google.generativeai as genai +from ..config.config_manager import ConfigManager + +# Define generic type for output classes +T = TypeVar( + 'T', + bound=Union['SummaryOutput', 'TagsOutput', 'ClassificationOutput', 'BatchClassificationOutput'] +) + +@dataclass +class SummaryOutput: + summary: str + +@dataclass +class TagsOutput: + tags: List[str] + +@dataclass +class ClassificationOutput: + assigned_tag: str + reason: str + +@dataclass +class BatchClassificationOutput: + classifications: List[ClassificationOutput] + + +class AIService: + def __init__(self, config_manager: ConfigManager): + self.config_manager = config_manager + self.api_key = self.config_manager.get_gemini_api_key() # Use specific getter + self.model_name = self.config_manager.get("gemini_model") # Model name can stay in JSON config + + if not self.api_key: + raise ValueError("GEMINI_API_KEY not found in configuration.") + if not self.model_name: + raise ValueError("GEMINI_MODEL not found in configuration.") + + genai.configure(api_key=self.api_key) + self.model = genai.GenerativeModel(self.model_name) + self.request_count = 0 + self.start_time = time.time() + + def _rate_limit_control(self): + """Basic rate limiting: 60 requests per minute for flash models.""" + self.request_count += 1 + elapsed_time = time.time() - self.start_time + if elapsed_time < 60 and self.request_count > 55: # Slight safety margin + sleep_time = 60 - elapsed_time + print(f"Rate limit approaching. 
Sleeping for {sleep_time:.2f} seconds.") + time.sleep(sleep_time) + self.request_count = 0 + self.start_time = time.time() + elif elapsed_time >= 60: + self.request_count = 0 + self.start_time = time.time() + + + def execute_query(self, prompt: str, output_class: Type[T]) -> T: + """Execute a query against the Gemini API and parse the response.""" + self._rate_limit_control() + print(f"\nSending prompt to Gemini (model: {self.model_name})...") + + try: + response = self.model.generate_content(prompt) + except Exception as e: + print(f"Error calling Gemini API: {e}") + # Fallback for errors + if output_class is SummaryOutput: + return SummaryOutput(summary="Error generating summary.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Error", reason="API call failed.") + if output_class is BatchClassificationOutput: + return BatchClassificationOutput(classifications=[]) + raise + + try: + text = response.text.strip() + # Try to find JSON block, robustly + json_str = None + if output_class is BatchClassificationOutput: # Expects a list + start_brace = text.find("[") + end_brace = text.rfind("]") + 1 # Add 1 to include the closing bracket + else: # Expects an object + start_brace = text.find("{") + end_brace = text.rfind("}") + 1 # Add 1 to include the closing brace + + if start_brace != -1 and end_brace > start_brace: + json_str = text[start_brace:end_brace] + data = json.loads(json_str) + else: + print("No valid JSON found in response.") + raise ValueError("No JSON object/array found in response") + + if output_class is SummaryOutput: + return SummaryOutput(summary=data.get("summary", "Summary not found in response.")) + if output_class is TagsOutput: + return TagsOutput(tags=data.get("tags", [])) + if output_class is ClassificationOutput: # For single classification + return ClassificationOutput( + assigned_tag=data.get("assigned_tag", "Other"), + reason=data.get("reason", "Could not classify project from response.") + ) + if output_class is BatchClassificationOutput: # For batch classification + classifications_data = data # data is already the list + parsed_classifications = [ + ClassificationOutput( + assigned_tag=item.get("assigned_tag", "Other"), + reason=item.get("reason", "Could not classify.") + ) for item in classifications_data + ] + return BatchClassificationOutput(classifications=parsed_classifications) + + raise ValueError(f"Unknown output class: {output_class}") + + except (json.JSONDecodeError, ValueError) as e: + print(f"Error processing Gemini response: {e}. 
Raw text: '{response.text[:300]}...'") + if output_class is SummaryOutput: + return SummaryOutput(summary="Failed to parse summary from response.") + if output_class is TagsOutput: + return TagsOutput(tags=[]) + if output_class is ClassificationOutput: + return ClassificationOutput(assigned_tag="Other", reason="Failed to parse classification.") + if output_class is BatchClassificationOutput: + # Return empty list of classifications for the batch + return BatchClassificationOutput(classifications=[]) + raise + + def make_summary(self, readme_md: str) -> SummaryOutput: + """Generate a summary of the project based on its README.""" + if not readme_md or not readme_md.strip(): + return SummaryOutput(summary="This appears to be an empty repository without a README file.") + + prompt_template = self.config_manager.get_summary_prompt_template() + prompt = prompt_template.format(readme_md=readme_md) + return self.execute_query(prompt, SummaryOutput) + + def make_tags(self, summary: str) -> TagsOutput: + """Generate tags for the project based on its summary.""" + if not summary or "empty repository" in summary.lower() or "error generating summary" in summary.lower(): + return TagsOutput(tags=[]) + + prompt_template = self.config_manager.get_tags_prompt_template() + prompt = prompt_template.format(summary=summary) + return self.execute_query(prompt, TagsOutput) + + def classify_projects_batch_for_persona( + self, + project_data_batch: List[Dict[str, Any]], # Changed from summaries: List[str] + persona: Dict[str, Any] + ) -> List[ClassificationOutput]: + """ + Classify multiple projects at once for a specific persona using their summaries and metadata. + Each item in project_data_batch is a dict with 'summary', 'star_count', etc. + The persona dictionary should contain 'name', 'title', 'description', and 'prompt' (template). + """ + if not project_data_batch: + return [] + + categories_list_str = "\n".join( + f"- \"{c['category']}\": {c['description']}" # Ensure category names are quoted for clarity in prompt + for c in self.config_manager.get_categories() + ) + + persona_prompt_template = persona.get('prompt') + if not persona_prompt_template: + print(f"Error: Persona '{persona.get('name')}' is missing a prompt template.") + return [ClassificationOutput(assigned_tag="Error", reason="Persona prompt missing")] * len(project_data_batch) + + individual_project_prompts = [] + for i, project_data in enumerate(project_data_batch): + # Prepare metadata for formatting, handling None or NaN + # Ensure star_count and fork_count are numbers, default to 0 if None/NaN + star_count = project_data.get('star_count') + fork_count = project_data.get('fork_count') + + formatted_star_count = int(star_count) if pd.notna(star_count) else 0 + formatted_fork_count = int(fork_count) if pd.notna(fork_count) else 0 + + # Format dates, default to "N/A" if None/NaT + created_at = project_data.get('created_at') + updated_at = project_data.get('updated_at') + + formatted_created_at = str(created_at.date()) if pd.notna(created_at) and hasattr(created_at, 'date') else "N/A" + formatted_updated_at = str(updated_at.date()) if pd.notna(updated_at) and hasattr(updated_at, 'date') else "N/A" + + # Ensure summary is a string + summary_text = project_data.get('summary', "No summary provided.") + if not isinstance(summary_text, str): + summary_text = str(summary_text) + + + try: + # The persona_prompt_template itself contains the persona's role description. + # We just need to format it with the project-specific data. 
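+                # Placeholders the template is expected to expose: {summary}, {star_count},
+                # {fork_count}, {created_at}, {updated_at}, and {categories}.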
+ # The {categories} placeholder in the persona prompt will be filled by this categories_list_str. + formatted_project_section = persona_prompt_template.format( + summary=summary_text, + star_count=formatted_star_count, + fork_count=formatted_fork_count, + created_at=formatted_created_at, + updated_at=formatted_updated_at, + categories=categories_list_str # Pass the formatted list of categories + ) + individual_project_prompts.append(f"--- Project {i+1} ---\n{formatted_project_section}") + except KeyError as e: + print(f"KeyError during prompt formatting for persona {persona.get('name')}, project {project_data.get('repo_artifact_id', 'Unknown')}: {e}") + # Add a placeholder error entry for this project + individual_project_prompts.append(f"--- Project {i+1} ---\nError formatting prompt for this project. Cannot classify.") + + + batch_project_details_str = "\n\n".join(individual_project_prompts) + + # Construct the overall batch prompt + # The persona's title and description can frame the overall task. + persona_title = persona.get('title', persona['name']) + persona_description = persona.get('description', '') + + final_batch_prompt = f"""As {persona_title} ({persona_description}), your task is to review and classify the following {len(project_data_batch)} project(s). +For each project, use the specific instructions and context provided under its section. + +{batch_project_details_str} + +After reviewing all projects, please respond with a single JSON array. Each element in the array should be a JSON object corresponding to one project, in the exact order they were presented above. Each object must contain: +1. "assigned_tag": The category you assigned from the provided list. +2. "reason": A brief explanation for your choice, following the persona's specific instructions. + +Example for two projects: +[ + {{ "assigned_tag": "Category A", "reason": "Reason for project 1..." }}, + {{ "assigned_tag": "Category B", "reason": "Reason for project 2..." 
}} +] +""" + + batch_output = self.execute_query(final_batch_prompt, BatchClassificationOutput) + + # Ensure the number of classifications matches the number of projects + if len(batch_output.classifications) != len(project_data_batch): + print(f"Warning: Mismatch in number of projects ({len(project_data_batch)}) and classifications ({len(batch_output.classifications)}) for persona {persona['name']}.") + error_classification = ClassificationOutput(assigned_tag="Error", reason="Mismatch in batch processing output length") + # Adjust the length of classifications to match project_data_batch + final_classifications = batch_output.classifications[:len(project_data_batch)] + while len(final_classifications) < len(project_data_batch): + final_classifications.append(error_classification) + batch_output.classifications = final_classifications + + return batch_output.classifications + + +if __name__ == '__main__': + # Example Usage + # Example Usage: + # cfg_manager = ConfigManager() + # ai_service = AIService(config_manager=cfg_manager) + # print("AIService initialized for standalone testing if needed.") + pass diff --git a/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py new file mode 100644 index 00000000..36bf25ac --- /dev/null +++ b/experiments/ethereum-repo-clusters/ethereum-repo-clusters/processing/fetcher.py @@ -0,0 +1,140 @@ +import base64 +import requests +import pandas as pd +import datetime +from pyoso import Client +from ..config.settings import OSO_API_KEY, GITHUB_HEADERS + +class DataFetcher: + def __init__(self): + self.oso_client = Client(api_key=OSO_API_KEY) + + def fetch_repositories(self, limit: int = None, sort_by_stars: bool = True) -> pd.DataFrame: + """ + Fetch repositories from OSO. + + Args: + limit: Optional limit on number of repositories to fetch. + sort_by_stars: If True, sort repositories by star_count descending. + """ + + where_keywords = """ + collection_name LIKE '%ethereum%' + OR collection_name LIKE '%arbitrum%' + OR collection_name LIKE '%optimism%' + OR collection_name LIKE '%scroll%' + OR collection_name LIKE '%polygon%' + """ + query = f""" + SELECT DISTINCT + re.artifact_id AS repo_artifact_id, + p.project_id, + p.project_name, + p.display_name, + re.artifact_namespace AS repo_artifact_namespace, + re.artifact_name AS repo_artifact_name, + re.created_at, + re.updated_at, + re.star_count, + re.fork_count, + re.is_fork, + re.num_packages_in_deps_dev + FROM int_repositories_enriched AS re + JOIN projects_v1 AS p ON re.project_id = p.project_id + WHERE p.project_id IN ( + SELECT DISTINCT project_id FROM oso.projects_by_collection_v1 + WHERE {where_keywords} + ) + """ + # The table int_superchain_s7_devtooling_repositories should have star_count + # If not, this sort will fail or do nothing. Assuming 'r.star_count' is valid. 
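+        # (The query above selects FROM int_repositories_enriched AS re, so the sort key
+        # applied below is re.star_count.)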
+ if sort_by_stars: + query += " ORDER BY re.star_count DESC, p.project_name ASC" + + if limit is not None and isinstance(limit, int) and limit > 0: + query += f" LIMIT {limit}" + + df = self.oso_client.to_pandas(query) + + # Add is_actively_maintained field based on updated_at (active if updated in last year) + # Use naive datetime (no timezone) for comparison + one_year_ago = pd.Timestamp.now().tz_localize(None) - pd.Timedelta(days=365) + + # Convert updated_at to datetime if it's a string + def check_if_active(date): + if pd.isna(date): + return False + + # Convert to datetime if it's a string + if isinstance(date, str): + try: + date = pd.to_datetime(date) + except: + return False + + # Ensure datetime is naive (no timezone) for comparison + if hasattr(date, 'tz_localize'): + if date.tzinfo is not None: + date = date.tz_localize(None) + + # Now compare with one_year_ago + return date > one_year_ago + + df['is_actively_maintained'] = df['updated_at'].apply(check_if_active) + + # Ensure is_fork is a boolean + if 'is_fork' not in df.columns: + print("Warning: 'is_fork' field not available in OSO data. Setting all to False.") + df['is_fork'] = False + else: + # Convert to boolean if it's not already + df['is_fork'] = df['is_fork'].fillna(False).astype(bool) + + return df + + def fetch_readme(self, owner: str, repo: str) -> tuple: + """ + Fetch README.md content from GitHub repository with debug logging. + + Returns: + tuple: (readme_content, status) where status is one of: + "SUCCESS", "EMPTY", or "ERROR" + """ + url = f"https://api.github.com/repos/{owner}/{repo}/readme" + print(f"Fetching README for {owner}/{repo} ...", flush=True) + resp = requests.get(url, headers=GITHUB_HEADERS) + print(f"Status code: {resp.status_code}", flush=True) + if resp.status_code == 200: + data = resp.json() + try: + content = base64.b64decode(data["content"]).decode("utf-8") + if not content.strip(): + print(f"Empty README for {owner}/{repo}", flush=True) + return "", "EMPTY" + print(f"Successfully fetched README for {owner}/{repo}", flush=True) + return content, "SUCCESS" + except Exception as e: + print(f"Error decoding README for {owner}/{repo}: {e}", flush=True) + return "", "ERROR" + else: + print(f"Failed to fetch README for {owner}/{repo}: {resp.text}", flush=True) + return "", "ERROR" + + def get_all_readmes(self, df: pd.DataFrame) -> pd.DataFrame: + """Add README content to the dataframe for each repository with debug logging.""" + print("First 5 repo_artifact_namespace:", df["repo_artifact_namespace"].head().tolist(), flush=True) + print("First 5 repo_artifact_name:", df["repo_artifact_name"].head().tolist(), flush=True) + + # Apply fetch_readme and capture both content and status with progress bar + from tqdm import tqdm + tqdm.pandas(desc="Fetching READMEs") + readme_results = df.progress_apply( + lambda row: self.fetch_readme(row.repo_artifact_namespace, row.repo_artifact_name), + axis=1 + ) + + # Split the results into separate columns + df["readme_md"] = [result[0] for result in readme_results] + df["readme_status"] = [result[1] for result in readme_results] + + return df diff --git a/experiments/ethereum-repo-clusters/pipeline_config.json b/experiments/ethereum-repo-clusters/pipeline_config.json new file mode 100644 index 00000000..65dc68d3 --- /dev/null +++ b/experiments/ethereum-repo-clusters/pipeline_config.json @@ -0,0 +1,10 @@ +{ + "output_dir": "/Users/cerv1/Dropbox/Kariba/Github/insights/experiments/devtooling_labels/output", + "gemini_model": "gemini-2.0-flash", + 
"summary_prompt_template": "You are an analyst preparing short, neutral briefs on open-source projects. Read the README below and write a **concise, 2- to 3-sentence summary** that:\n\u2022 states the project\u2019s core purpose / problem it solves\n\u2022 lists its main capabilities or components (1\u20133 key points only)\n\u2022 mentions the primary intended users or systems (e.g., smart-contract developers, node operators)\n\u2022 notes any strongly signalled context such as supported programming language, network, or runtime\n\n**Style constraints**\n\u2022 Use plain, factual language in third person (no hype, no marketing adjectives).\n\u2022 **Do not** guess or invent details that are not explicit in the README.\n\u2022 **Do not** label the project with, or copy wording from, the taxonomy below (to avoid category leakage).\n\u2022 Limit the summary to <100 words; avoid bullet lists or line breaks.\n\nReturn your answer as **exactly one valid JSON object** in this form (nothing extra):\n{{\n \"summary\": \"your summary here\"\n}}\n\nREADME:\n{readme_md}", + "tags_prompt_template": "Based on this project summary, generate a list of relevant tags that describe the project's purpose and functionality.\n\nYou must respond with a valid JSON object in this exact format:\n{{\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n}}\n\nSummary:\n{summary}", + "test_mode": false, + "test_mode_limit": 30, + "batch_size_summaries": 10, + "batch_size_categorization": 10 +} \ No newline at end of file diff --git a/experiments/ethereum-repo-clusters/requirements.txt b/experiments/ethereum-repo-clusters/requirements.txt new file mode 100644 index 00000000..7ab7b91e --- /dev/null +++ b/experiments/ethereum-repo-clusters/requirements.txt @@ -0,0 +1,8 @@ +pandas>=2.0.0 +requests>=2.31.0 +pyoso>=0.1.0 +google-generativeai>=0.3.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +click>=8.0.0 +pyarrow>=14.0.0 # For parquet support diff --git a/experiments/ethereum-repo-clusters/setup.py b/experiments/ethereum-repo-clusters/setup.py new file mode 100644 index 00000000..1b43d1c3 --- /dev/null +++ b/experiments/ethereum-repo-clusters/setup.py @@ -0,0 +1,16 @@ +from setuptools import setup, find_packages + +setup( + name="devtooling_labels", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "pandas>=2.0.0", + "requests>=2.31.0", + "pyoso>=0.1.0", + "google-generativeai>=0.3.0", + "pydantic>=2.0.0", + "python-dotenv>=1.0.0", + ], + python_requires=">=3.8", +) \ No newline at end of file diff --git a/tutorials/FundingMetrics.ipynb b/tutorials/FundingMetrics.ipynb new file mode 100644 index 00000000..d64c4876 --- /dev/null +++ b/tutorials/FundingMetrics.ipynb @@ -0,0 +1,1192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "41b714c9-c749-4d0d-ad59-71e0c035d325", + "metadata": {}, + "outputs": [], + "source": [ + "# ! 
pip install pyoso" + ] + }, + { + "cell_type": "markdown", + "id": "413d143d-4494-4812-8cae-d28f47cc397e", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Load environment variables, import necessary libraries, and initialize the OSO client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "988cd219-8b29-469d-9e7a-46f7a965ddc7", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "import pandas as pd\n", + "from pyoso import Client\n", + "\n", + "load_dotenv()\n", + "\n", + "OSO_API_KEY = os.environ['OSO_API_KEY']\n", + "client = Client(api_key=OSO_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "id": "4231b926-89ab-48de-8746-b0d10f44c470", + "metadata": {}, + "source": [ + "## Testing\n", + "\n", + "Query the metrics table for all metric names containing '_funding_' and display them in alphabetical order" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "77fe6763-7d49-47a5-b5c6-68264034ab0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_name
0GITCOIN_DONATIONS_funding_awarded_biannually
1GITCOIN_DONATIONS_funding_awarded_daily
2GITCOIN_DONATIONS_funding_awarded_monthly
3GITCOIN_DONATIONS_funding_awarded_over_all_time
4GITCOIN_DONATIONS_funding_awarded_quarterly
5GITCOIN_DONATIONS_funding_awarded_weekly
6GITCOIN_DONATIONS_funding_awarded_yearly
7GITCOIN_MATCHING_funding_awarded_biannually
8GITCOIN_MATCHING_funding_awarded_daily
9GITCOIN_MATCHING_funding_awarded_monthly
10GITCOIN_MATCHING_funding_awarded_over_all_time
11GITCOIN_MATCHING_funding_awarded_quarterly
12GITCOIN_MATCHING_funding_awarded_weekly
13GITCOIN_MATCHING_funding_awarded_yearly
14OPEN_COLLECTIVE_funding_received_biannually
15OPEN_COLLECTIVE_funding_received_daily
16OPEN_COLLECTIVE_funding_received_monthly
17OPEN_COLLECTIVE_funding_received_over_all_time
18OPEN_COLLECTIVE_funding_received_quarterly
19OPEN_COLLECTIVE_funding_received_weekly
20OPEN_COLLECTIVE_funding_received_yearly
21OSS_FUNDING_funding_awarded_biannually
22OSS_FUNDING_funding_awarded_daily
23OSS_FUNDING_funding_awarded_monthly
24OSS_FUNDING_funding_awarded_over_all_time
25OSS_FUNDING_funding_awarded_quarterly
26OSS_FUNDING_funding_awarded_weekly
27OSS_FUNDING_funding_awarded_yearly
\n", + "
" + ], + "text/plain": [ + " metric_name\n", + "0 GITCOIN_DONATIONS_funding_awarded_biannually\n", + "1 GITCOIN_DONATIONS_funding_awarded_daily\n", + "2 GITCOIN_DONATIONS_funding_awarded_monthly\n", + "3 GITCOIN_DONATIONS_funding_awarded_over_all_time\n", + "4 GITCOIN_DONATIONS_funding_awarded_quarterly\n", + "5 GITCOIN_DONATIONS_funding_awarded_weekly\n", + "6 GITCOIN_DONATIONS_funding_awarded_yearly\n", + "7 GITCOIN_MATCHING_funding_awarded_biannually\n", + "8 GITCOIN_MATCHING_funding_awarded_daily\n", + "9 GITCOIN_MATCHING_funding_awarded_monthly\n", + "10 GITCOIN_MATCHING_funding_awarded_over_all_time\n", + "11 GITCOIN_MATCHING_funding_awarded_quarterly\n", + "12 GITCOIN_MATCHING_funding_awarded_weekly\n", + "13 GITCOIN_MATCHING_funding_awarded_yearly\n", + "14 OPEN_COLLECTIVE_funding_received_biannually\n", + "15 OPEN_COLLECTIVE_funding_received_daily\n", + "16 OPEN_COLLECTIVE_funding_received_monthly\n", + "17 OPEN_COLLECTIVE_funding_received_over_all_time\n", + "18 OPEN_COLLECTIVE_funding_received_quarterly\n", + "19 OPEN_COLLECTIVE_funding_received_weekly\n", + "20 OPEN_COLLECTIVE_funding_received_yearly\n", + "21 OSS_FUNDING_funding_awarded_biannually\n", + "22 OSS_FUNDING_funding_awarded_daily\n", + "23 OSS_FUNDING_funding_awarded_monthly\n", + "24 OSS_FUNDING_funding_awarded_over_all_time\n", + "25 OSS_FUNDING_funding_awarded_quarterly\n", + "26 OSS_FUNDING_funding_awarded_weekly\n", + "27 OSS_FUNDING_funding_awarded_yearly" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT metric_name\n", + "FROM metrics_v0\n", + "WHERE metric_name LIKE '%_funding_%'\n", + "ORDER BY 1\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "e85f23d4-3605-40f6-8228-ce175de20f30", + "metadata": {}, + "source": [ + "## Aggregate funding metrics\n", + "\n", + "### By source\n", + "\n", + "We currently support CSV data uploads via [oss-funding](https://github.com/opensource-observer/oss-funding) and Gitcoin Grants. We also have Open Collective deposits, but they don't show up here (yet)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aa31b0d1-2117-4fb0-8b6e-f206500602d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_nametotal_amount_in_usd
0OSS_FUNDING_funding_awarded_over_all_time364887873.600968
1GITCOIN_MATCHING_funding_awarded_over_all_time13305117.158144
2GITCOIN_DONATIONS_funding_awarded_over_all_time11666103.711711
\n", + "
" + ], + "text/plain": [ + " metric_name total_amount_in_usd\n", + "0 OSS_FUNDING_funding_awarded_over_all_time 364887873.600968\n", + "1 GITCOIN_MATCHING_funding_awarded_over_all_time 13305117.158144\n", + "2 GITCOIN_DONATIONS_funding_awarded_over_all_time 11666103.711711" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " m.metric_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "WHERE m.metric_name LIKE '%_funding_%'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d2902b8-468b-44ae-bae9-d797ec18de0d", + "metadata": {}, + "source": [ + "### To projects\n", + "\n", + "We can also see the largest project recipients with this query." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9edc7a4-19d1-443c-bc01-ced2c6ffda65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0GMX21000000.0
1MUX Protocol10876479.0
2Synthetix10022628.074157
3Perpetual Protocol9287212.140718
4Gains Network7898396.135
5Velodrome7895037.76024
6Camelot5407500.0
7Stargate Finance5289458.865658
8Vertex Protocol5250000.0
9Radiant4991077.0
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 GMX 21000000.0\n", + "1 MUX Protocol 10876479.0\n", + "2 Synthetix 10022628.074157\n", + "3 Perpetual Protocol 9287212.140718\n", + "4 Gains Network 7898396.135\n", + "5 Velodrome 7895037.76024\n", + "6 Camelot 5407500.0\n", + "7 Stargate Finance 5289458.865658\n", + "8 Vertex Protocol 5250000.0\n", + "9 Radiant 4991077.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name LIKE '%_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0841ca11-505d-47bc-acdc-5ca8314af991", + "metadata": {}, + "source": [ + "### To projects from a specific source" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9783c7e0-c2e0-4f1c-9dda-42d458cb39ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin1099895.038376
1Revoke748859.365745
2DefiLlama429924.507285
3Hey360529.24178
4JediSwap333277.670918
5Dark Forest332205.420888
6ZigZag Exchange210175.931949
7ethers.js190702.539836
8rotki174990.340416
9Taho170854.869607
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 1099895.038376\n", + "1 Revoke 748859.365745\n", + "2 DefiLlama 429924.507285\n", + "3 Hey 360529.24178\n", + "4 JediSwap 333277.670918\n", + "5 Dark Forest 332205.420888\n", + "6 ZigZag Exchange 210175.931949\n", + "7 ethers.js 190702.539836\n", + "8 rotki 174990.340416\n", + "9 Taho 170854.869607" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(km.amount) AS total_amount_in_usd\n", + "FROM key_metrics_by_project_v0 AS km\n", + "JOIN metrics_v0 AS m ON km.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON km.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_over_all_time'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "807ac8df-5c90-4c14-821b-d29ebab1089c", + "metadata": {}, + "source": [ + "### To projects from a specific source and time frame" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69f8d8ff-f902-4cce-bc3f-d23182f84de3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_display_nametotal_amount_in_usd
0Gitcoin797206.330376
1Dark Forest297517.115925
2ZigZag Exchange199746.433382
3ethers.js129500.966707
4Prysm Ethereum Client128522.705766
5rotki122666.927997
6ZeroPool116795.642612
7Lighthouse by Sigma Prime114759.839844
8The Tor Project110669.738113
9Hardhat110539.758225
\n", + "
" + ], + "text/plain": [ + " project_display_name total_amount_in_usd\n", + "0 Gitcoin 797206.330376\n", + "1 Dark Forest 297517.115925\n", + "2 ZigZag Exchange 199746.433382\n", + "3 ethers.js 129500.966707\n", + "4 Prysm Ethereum Client 128522.705766\n", + "5 rotki 122666.927997\n", + "6 ZeroPool 116795.642612\n", + "7 Lighthouse by Sigma Prime 114759.839844\n", + "8 The Tor Project 110669.738113\n", + "9 Hardhat 110539.758225" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " p.display_name AS project_display_name,\n", + " SUM(tm.amount) AS total_amount_in_usd\n", + "FROM timeseries_metrics_by_project_v0 AS tm\n", + "JOIN metrics_v0 AS m ON tm.metric_id = m.metric_id\n", + "JOIN projects_v1 AS p ON tm.project_id = p.project_id\n", + "WHERE m.metric_name = 'GITCOIN_DONATIONS_funding_awarded_yearly'\n", + "AND tm.sample_date < DATE '2022-01-01'\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC\n", + "LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe5ad20c-7772-4e94-83ec-77baea215d00", + "metadata": {}, + "source": [ + "## More granular analysis\n", + "\n", + "### Gitcoin\n", + "\n", + "Deep dive on Gitcoin grants to a specific project" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "60aa05c7-098f-4e6f-8183-a01fc5ed9652", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timeround_numberround_nameevent_sourcedonor_addressamount_in_usd
02023-02-09 15:58:07.078<NA>Gitcoin GrantsGITCOIN_DONATIONS0x386ea3171dcc9405311fd75b316cc2a87ecadeca617893.575
12023-08-29 08:32:57.00018Web3 Open Source SoftwareGITCOIN_MATCHING<NA>15001.1375
22024-11-25 14:26:59.00022GG22 OSS - dApps and AppsGITCOIN_MATCHING<NA>14984.28125
32024-05-31 14:35:02.00020dApps & AppsGITCOIN_MATCHING<NA>14979.978125
42023-11-29 20:18:47.00019Web3 Open Source SoftwareGITCOIN_MATCHING<NA>14849.591509
52022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>12500.0
62024-08-26 15:19:00.00021GG21: Thriving Arbitrum SummerGITCOIN_MATCHING<NA>9839.680095
72022-08-24 00:00:00.00015<NA>GITCOIN_MATCHING<NA>7410.488854
82024-05-07 10:04:49.00020dApps & AppsGITCOIN_DONATIONS0xe2a26d5174b133abc4b338df1b07295f03a4c85e1000.42865
92024-05-06 17:29:47.00020dApps & AppsGITCOIN_DONATIONS0x60a06b2eee871e349331143ef173ecefd7a8ce01537.338562
\n", + "
" + ], + "text/plain": [ + " time round_number round_name \\\n", + "0 2023-02-09 15:58:07.078 Gitcoin Grants \n", + "1 2023-08-29 08:32:57.000 18 Web3 Open Source Software \n", + "2 2024-11-25 14:26:59.000 22 GG22 OSS - dApps and Apps \n", + "3 2024-05-31 14:35:02.000 20 dApps & Apps \n", + "4 2023-11-29 20:18:47.000 19 Web3 Open Source Software \n", + "5 2022-08-24 00:00:00.000 15 \n", + "6 2024-08-26 15:19:00.000 21 GG21: Thriving Arbitrum Summer \n", + "7 2022-08-24 00:00:00.000 15 \n", + "8 2024-05-07 10:04:49.000 20 dApps & Apps \n", + "9 2024-05-06 17:29:47.000 20 dApps & Apps \n", + "\n", + " event_source donor_address \\\n", + "0 GITCOIN_DONATIONS 0x386ea3171dcc9405311fd75b316cc2a87ecadeca \n", + "1 GITCOIN_MATCHING \n", + "2 GITCOIN_MATCHING \n", + "3 GITCOIN_MATCHING \n", + "4 GITCOIN_MATCHING \n", + "5 GITCOIN_MATCHING \n", + "6 GITCOIN_MATCHING \n", + "7 GITCOIN_MATCHING \n", + "8 GITCOIN_DONATIONS 0xe2a26d5174b133abc4b338df1b07295f03a4c85e \n", + "9 GITCOIN_DONATIONS 0x60a06b2eee871e349331143ef173ecefd7a8ce01 \n", + "\n", + " amount_in_usd \n", + "0 617893.575 \n", + "1 15001.1375 \n", + "2 14984.28125 \n", + "3 14979.978125 \n", + "4 14849.591509 \n", + "5 12500.0 \n", + "6 9839.680095 \n", + "7 7410.488854 \n", + "8 1000.42865 \n", + "9 537.338562 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + " SELECT\n", + " time,\n", + " round_number,\n", + " round_name,\n", + " event_source,\n", + " donor_address,\n", + " amount_in_usd\n", + " FROM int_events__gitcoin_funding\n", + " WHERE gitcoin_group_project_name = 'revokecash'\n", + " ORDER BY amount_in_usd DESC\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "144e833e-cf24-4659-a43b-35505d95501a", + "metadata": {}, + "source": [ + "## OSS Funding\n", + "\n", + "Overview of specific funders and grant pools in oss-funding data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fa50c103-7c3c-4fb2-bcb1-ff754a8b53f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_funder_namegrant_poolsamount_in_usd
0optimism12240450291.744
1arbitrumfoundation1122850952.0
2stellar2932989032.98
3octant-golemfoundation53965429.51329
4dao-drops-dorgtech1250001.0
5clrfund183028.740386
\n", + "
" + ], + "text/plain": [ + " from_funder_name grant_pools amount_in_usd\n", + "0 optimism 12 240450291.744\n", + "1 arbitrumfoundation 1 122850952.0\n", + "2 stellar 29 32989032.98\n", + "3 octant-golemfoundation 5 3965429.51329\n", + "4 dao-drops-dorgtech 1 250001.0\n", + "5 clrfund 1 83028.740386" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.to_pandas(\"\"\"\n", + "SELECT\n", + " from_funder_name,\n", + " COUNT(DISTINCT grant_pool_name) AS grant_pools,\n", + " SUM(amount) AS amount_in_usd\n", + "FROM stg_ossd__current_funding\n", + "GROUP BY 1\n", + "ORDER BY 3 DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7aaf46c-45b8-444e-a6f1-27286085a463", + "metadata": {}, + "source": [ + "## Funding flows\n", + "\n", + "We can use this to construct a simple sankey diagram of funding flows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1bd3a7a8-8cac-420e-9e7b-5259ff0e3b2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
from_project_idto_project_idfunderprojectamount
2142Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg=Gitcoinefdevcon13.531599
2143Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=GitcoinLexDAO86499.728685
21445Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s=clr.fundLexDAO193.952856
2145Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI=yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI=GitcoinDeFiEye224058.115245
21465Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ=JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q=clr.fundAsilo Digital703.639308
\n", + "
" + ], + "text/plain": [ + " from_project_id \\\n", + "2142 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2143 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2144 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "2145 Qgbm336fY9862LN2Czg3UX04A3p7I/79Bv2M4D61DAI= \n", + "2146 5Fgf9xv3CxTV+YbSShdY9XCJs7tgW8KNwQWq9rHUEsQ= \n", + "\n", + " to_project_id funder project \\\n", + "2142 8IKXraxq1pDuQD1xaDI20cjFrel55TZ/zf6LmP69qEg= Gitcoin efdevcon \n", + "2143 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= Gitcoin LexDAO \n", + "2144 79HQoZtyZftibazh6Yz63aU06XODWs7b/9h4JAqPa1s= clr.fund LexDAO \n", + "2145 yEebFy4M1iAdb9+YQmdssSx9Qf+ZXfSVguL/JyidngI= Gitcoin DeFiEye \n", + "2146 JQtLQErRk0u41xS292Cg+s3cRr8LaD5lQ2kME/Syp2Q= clr.fund Asilo Digital \n", + "\n", + " amount \n", + "2142 13.531599 \n", + "2143 86499.728685 \n", + "2144 193.952856 \n", + "2145 224058.115245 \n", + "2146 703.639308 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "SELECT\n", + " fp.project_id AS from_project_id,\n", + " tp.project_id AS to_project_id,\n", + " fp.display_name AS funder,\n", + " tp.display_name AS project,\n", + " SUM(e.amount) AS amount\n", + "FROM int_events_daily__funding AS e\n", + "JOIN artifacts_by_project_v1 AS fa\n", + " ON e.from_artifact_id = fa.artifact_id\n", + "JOIN artifacts_by_project_v1 AS ta\n", + " ON e.to_artifact_id = ta.artifact_id\n", + "JOIN projects_v1 AS fp\n", + " ON fa.project_id = fp.project_id\n", + "JOIN projects_v1 AS tp\n", + " ON ta.project_id = tp.project_id\n", + "GROUP BY 1,2,3,4\n", + "\"\"\"\n", + "df = client.to_pandas(query)\n", + "df.tail()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}