diff --git a/.gitignore b/.gitignore
index 384bad5..c85f5fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,6 @@ node_modules
 llm-server/.venv3
 venv3/
 llm-server\venv3
+__pycache__/
+*.pyc
+*.pyo
diff --git a/llm-server/__pycache__/app.cpython-312.pyc b/llm-server/__pycache__/app.cpython-312.pyc
deleted file mode 100644
index 39d3d0b..0000000
Binary files a/llm-server/__pycache__/app.cpython-312.pyc and /dev/null differ
diff --git a/llm-server/app.py b/llm-server/app.py
index 5fe2257..35d83e0 100644
--- a/llm-server/app.py
+++ b/llm-server/app.py
@@ -51,12 +51,6 @@ def format(self, record):
         "https://www.gitforme.tech"
     ]
 CORS(app, origins=allowed_origins, supports_credentials=True)
-@app.after_request
-def apply_cors(response):
-    response.headers["Access-Control-Allow-Origin"] = "https://www.gitforme.tech"
-    response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
-    response.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
-    return response
 
 app.config['PROPAGATE_EXCEPTIONS'] = True
 app.config['DEBUG'] = True
@@ -71,10 +65,12 @@ def apply_cors(response):
     logging.critical(f"Failed to load embedding model: {e}")
     exit()
 
-repo_cache = LRUCache(maxsize=5)
+repo_cache = LRUCache(maxsize=20)  # Increased from 5 to 20 repositories to improve cache hit rates and reduce GitHub API calls
 global_api_call_times = deque()
 GLOBAL_MAX_CALLS_PER_HOUR = 10
 WINDOW_SECONDS = 3600
+MAX_FILES_TO_PROCESS = 200  # Limit number of files to prevent memory issues
+MAX_FILE_SIZE = 100000  # Max file size in bytes
 
 def extract_owner_repo(repo_url: str):
     if "github.com" in repo_url:
@@ -86,6 +82,9 @@ def extract_owner_repo(repo_url: str):
         raise ValueError(f"Invalid GitHub repo format: {repo_url}. Expected 'owner/repo' or a GitHub URL.")
     return parts[0], parts[1]
 
+# Set of directories to skip for more efficient filtering
+SKIP_DIRECTORIES = {'node_modules', 'vendor', 'dist', 'build', '__pycache__', '.git', 'venv', 'target', 'bin', 'obj'}
+
 def summarize_code(file_path, code):
     summary_lines = []
     lines = code.splitlines()
@@ -148,12 +147,15 @@ async def get_relevant_context(repo_url, query):
     files_to_fetch = [
         f for f in tree_json.get("tree", [])
-        if f['type'] == 'blob' and not f['path'].startswith('.') and f['size'] < 100000
+        if f['type'] == 'blob'
+        and not f['path'].startswith('.')
+        and not any(skip_dir in f['path'].split('/') for skip_dir in SKIP_DIRECTORIES)
+        and f['size'] < MAX_FILE_SIZE
         and f['path'].endswith((
             '.py', '.js', '.ts', '.tsx', '.go', '.rs', '.java', '.cs', '.php', '.rb',
             '.json', '.yml', '.yaml', 'Dockerfile', 'README.md', 'CONTRIBUTING.md'
         ))
-    ]
+    ][:MAX_FILES_TO_PROCESS]  # Limit total files to process
 
     if not files_to_fetch:
         return None, "No relevant code or documentation files were found in this repository."
logging.info(f"Identified {len(files_to_fetch)} files to fetch content for.") @@ -179,11 +181,23 @@ async def get_relevant_context(repo_url, query): file_paths = list(file_summaries.keys()) code_chunks = list(file_summaries.values()) + # Process embeddings in batches to reduce memory usage embedding_start_time = time.time() + EMBEDDING_BATCH_SIZE = 50 + all_embeddings = [] + with torch.inference_mode(): - encoded = EMBEDDING_TOKENIZER(code_chunks, padding=True, truncation=True, return_tensors='pt', max_length=512) - output = EMBEDDING_MODEL(**encoded) - embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy().astype('float32') + for i in range(0, len(code_chunks), EMBEDDING_BATCH_SIZE): + batch = code_chunks[i:i + EMBEDDING_BATCH_SIZE] + encoded = EMBEDDING_TOKENIZER(batch, padding=True, truncation=True, return_tensors='pt', max_length=512) + output = EMBEDDING_MODEL(**encoded) + batch_embeddings = output.last_hidden_state.mean(dim=1).cpu().numpy().astype('float32') + all_embeddings.append(batch_embeddings) + + if not all_embeddings: + return None, "No valid embeddings could be generated from the repository files." + + embeddings = np.vstack(all_embeddings) logging.info(f"Generated {len(embeddings)} embeddings in {time.time() - embedding_start_time:.2f}s.") faiss_index_start_time = time.time() diff --git a/server/Controllers/GithubController.js b/server/Controllers/GithubController.js index 2e74586..cfc9ec3 100644 --- a/server/Controllers/GithubController.js +++ b/server/Controllers/GithubController.js @@ -2,24 +2,9 @@ const axios = require('axios'); const User = require('../models/UserModel'); const redisClient = require('../util/RediaClient'); const { Octokit } = require("@octokit/rest"); +const { createGithubApi } = require('../util/GithubApiHelper'); -const createGithubApi = async (session) => { - const headers = { 'Accept': 'application/vnd.github.v3+json' }; - - if (session?.userId) { - const user = await User.findById(session.userId); - if (user?.githubAccessToken) { - headers['Authorization'] = `token ${user.githubAccessToken}`; - console.log(`Making authenticated GitHub API request for user ${user.username}.`); - return axios.create({ baseURL: 'https://api.github.com', headers }); - } - } - - console.log('Making unauthenticated GitHub API request (fallback).'); - return axios.create({ baseURL: 'https://api.github.com', headers }); -}; - exports.getRepoTimeline = async (req, res) => { const { username, reponame } = req.params; const userId = req.session.userId || 'public'; @@ -40,20 +25,34 @@ exports.getRepoTimeline = async (req, res) => { // 2. Fetch all tags const { data: tagsData } = await githubApi.get(`/repos/${username}/${reponame}/tags`); - // 3. Fetch commits (limit to 500 using per_page and pagination) + // 3. 
         const commits = [];
-        let page = 1;
+        const maxCommits = 500;
         const perPage = 100;
+        const maxPages = Math.ceil(maxCommits / perPage);
+
+        // Use Promise.all to fetch pages in parallel for better performance
+        const pagePromises = [];
+        for (let page = 1; page <= maxPages; page++) {
+            pagePromises.push(
+                githubApi.get(`/repos/${username}/${reponame}/commits`, {
+                    params: { per_page: perPage, page },
+                }).catch(err => {
+                    console.warn(`Failed to fetch page ${page}:`, err.message);
+                    return { data: [] };
+                })
+            );
+        }
 
-        while (commits.length < 500) {
-            const { data: pageCommits } = await githubApi.get(`/repos/${username}/${reponame}/commits`, {
-                params: { per_page: perPage, page },
-            });
+        const pageResults = await Promise.all(pagePromises);
+        for (const { data: pageCommits } of pageResults) {
             if (pageCommits.length === 0) break;
             commits.push(...pageCommits);
-            if (pageCommits.length < perPage) break;
-            page++;
+            if (commits.length >= maxCommits) break;
         }
+
+        // Trim to the exact limit, since the last page can push us past maxCommits
+        const trimmedCommits = commits.slice(0, maxCommits);
 
         // Map tags to SHAs
         const tagMap = {};
@@ -61,7 +60,7 @@ exports.getRepoTimeline = async (req, res) => {
             tagMap[tag.commit.sha] = tag.name;
         }
 
-        const processedCommits = commits.map(commit => ({
+        const processedCommits = trimmedCommits.map(commit => ({
             sha: commit.sha,
             message: commit.commit.message,
             author: {
@@ -108,31 +107,35 @@ exports.fetchCodeHotspots = async (req, res) => {
             params: { per_page: 100 }
         });
 
-      const commitDetailsPromises = commitsResponse.data.map(commit =>
-        githubApi.get(commit.url)
-      );
-      const commitDetails = await Promise.all(commitDetailsPromises);
-
-      const fileChurn = new Map();
-      commitDetails.forEach(commitDetail => {
-        if (commitDetail.data.files) {
-          commitDetail.data.files.forEach(file => {
-            fileChurn.set(file.filename, (fileChurn.get(file.filename) || 0) + 1);
-          });
-        }
-      });
+        // Limit concurrency to avoid overwhelming the API
+        const CONCURRENCY_LIMIT = 10;
+        const fileChurn = new Map();
+
+        for (let i = 0; i < commitsResponse.data.length; i += CONCURRENCY_LIMIT) {
+            const batch = commitsResponse.data.slice(i, i + CONCURRENCY_LIMIT);
+            const batchPromises = batch.map(commit => githubApi.get(commit.url));
+            const batchDetails = await Promise.all(batchPromises);
+
+            batchDetails.forEach(commitDetail => {
+                if (commitDetail.data.files) {
+                    commitDetail.data.files.forEach(file => {
+                        fileChurn.set(file.filename, (fileChurn.get(file.filename) || 0) + 1);
+                    });
+                }
+            });
+        }
 
-      const hotspots = Array.from(fileChurn, ([path, churn]) => ({ path, churn }))
-        .sort((a, b) => b.churn - a.churn);
+        const hotspots = Array.from(fileChurn, ([path, churn]) => ({ path, churn }))
+            .sort((a, b) => b.churn - a.churn);
 
-      await redisClient.set(cacheKey, JSON.stringify(hotspots), { EX: 3600 });
-      res.json(hotspots);
+        await redisClient.set(cacheKey, JSON.stringify(hotspots), { EX: 3600 });
+        res.json(hotspots);
 
-  } catch (error) {
-      console.error("Error fetching code hotspots:", error.response?.data || error.message);
-      res.status(error.response?.status || 500).json({ message: "Error fetching code hotspots from GitHub." });
-  }
-  };
+    } catch (error) {
+        console.error("Error fetching code hotspots:", error.response?.data || error.message);
+        res.status(error.response?.status || 500).json({ message: "Error fetching code hotspots from GitHub." });
+    }
+};
 
 exports.fetchIssueTimeline = async (req, res) => {
     const { username, reponame, issue_number } = req.params;
@@ -381,14 +384,27 @@ exports.fetchDeployments = async (req, res) => {
             return res.json([]);
         }
 
-        const statusPromises = deployments.map(deployment =>
-            githubApi.get(deployment.statuses_url).then(statusResponse => ({
-                ...deployment,
-                statuses: statusResponse.data
-            }))
-        );
+        // Batch deployment status fetches with concurrency control
+        const CONCURRENCY_LIMIT = 10;
+        const deploymentsWithStatuses = [];
 
-        const deploymentsWithStatuses = await Promise.all(statusPromises);
+        for (let i = 0; i < deployments.length; i += CONCURRENCY_LIMIT) {
+            const batch = deployments.slice(i, i + CONCURRENCY_LIMIT);
+            const batchPromises = batch.map(deployment =>
+                githubApi.get(deployment.statuses_url)
+                    .then(statusResponse => ({
+                        ...deployment,
+                        statuses: statusResponse.data
+                    }))
+                    .catch(err => {
+                        console.warn(`Failed to fetch status for deployment ${deployment.id}:`, err.message);
+                        return { ...deployment, statuses: [] };
+                    })
+            );
+
+            const batchResults = await Promise.all(batchPromises);
+            deploymentsWithStatuses.push(...batchResults);
+        }
 
         const activeDeploymentUrls = new Map();
         deploymentsWithStatuses.forEach(deployment => {
diff --git a/server/Controllers/InsightController.js b/server/Controllers/InsightController.js
index 3a7c7ac..df97c05 100644
--- a/server/Controllers/InsightController.js
+++ b/server/Controllers/InsightController.js
@@ -1,26 +1,7 @@
 const axios = require('axios');
 const User = require('../models/UserModel');
 const redisClient = require('../util/RediaClient');
-
-const createGithubApi = async (session) => {
-    const headers = { 'Accept': 'application/vnd.github.v3+json' };
-
-    if (session?.userId) {
-        try {
-            const user = await User.findById(session.userId);
-            if (user?.githubAccessToken) {
-                headers['Authorization'] = `token ${user.githubAccessToken}`;
-                console.log(`Making authenticated GitHub API request for user ${user.username}.`);
-                return axios.create({ baseURL: 'https://api.github.com', headers });
-            }
-        } catch (dbError) {
-            console.error("Error fetching user for authenticated API call:", dbError.message);
-        }
-    }
-
-    console.log('Making unauthenticated GitHub API request (fallback).');
-    return axios.create({ baseURL: 'https://api.github.com', headers });
-};
+const { createGithubApi } = require('../util/GithubApiHelper');
 
 exports.fetchDependencyHealth = async (req, res) => {
     const { username, reponame } = req.params;
@@ -59,22 +40,33 @@ exports.fetchDependencyHealth = async (req, res) => {
             return res.json({ dependencies: [], summary: { total: 0, outdated: 0, deprecated: 0, licenses: [] } });
         }
 
-        const dependencyPromises = Object.entries(dependencies).map(async ([name, version]) => {
-            try {
-                const npmResponse = await axios.get(`https://registry.npmjs.org/${name}`);
-                const latestVersion = npmResponse.data['dist-tags'].latest;
-                const license = npmResponse.data.license || 'N/A';
-                const isDeprecated = !!npmResponse.data.deprecated;
-                const isOutdated = latestVersion !== version.replace(/[\^~>=<]/g, '');
+        // Batch dependency checks with concurrency control to avoid overwhelming npm registry
+        const CONCURRENCY_LIMIT = 10;
+        const dependencyEntries = Object.entries(dependencies);
+        const healthReport = [];
+
+        for (let i = 0; i < dependencyEntries.length; i += CONCURRENCY_LIMIT) {
+            const batch = dependencyEntries.slice(i, i + CONCURRENCY_LIMIT);
+            const batchPromises = batch.map(async ([name, version]) => {
+                try {
+                    const npmResponse = await axios.get(`https://registry.npmjs.org/${name}`, {
+                        timeout: 5000 // Add timeout to prevent hanging
+                    });
+                    const latestVersion = npmResponse.data['dist-tags'].latest;
+                    const license = npmResponse.data.license || 'N/A';
+                    const isDeprecated = !!npmResponse.data.deprecated;
+                    const isOutdated = latestVersion !== version.replace(/[\^~>=<]/g, '');
 
-                return { name, version, latestVersion, license, isOutdated, isDeprecated };
-            } catch (error) {
-                console.error(`Error fetching data for ${name}:`, error.message);
-                return { name, version, error: 'Package not found in npm registry' };
-            }
-        });
+                    return { name, version, latestVersion, license, isOutdated, isDeprecated };
+                } catch (error) {
+                    console.error(`Error fetching data for ${name}:`, error.message);
+                    return { name, version, error: 'Package not found in npm registry' };
+                }
+            });
 
-        const healthReport = await Promise.all(dependencyPromises);
+            const batchResults = await Promise.all(batchPromises);
+            healthReport.push(...batchResults);
+        }
 
         const summary = {
             total: healthReport.length,
diff --git a/server/api/githubApi.js b/server/api/githubApi.js
index 54ab4ff..c2831ea 100644
--- a/server/api/githubApi.js
+++ b/server/api/githubApi.js
@@ -1,6 +1,7 @@
 const axios = require('axios');
 const redisClient = require('../util/RediaClient');
 const User = require('../models/UserModel');
+const { createGithubApi } = require('../util/GithubApiHelper');
 
 const githubApi = axios.create({
     baseURL: 'https://api.github.com',
@@ -68,21 +69,3 @@ exports.fetchRepoDetails = async (req, res) => {
     }
 };
 
-const createGithubApi = async (session) => {
-    const headers = { Accept: 'application/vnd.github.v3+json' };
-
-    if (session?.userId) {
-        const user = await User.findById(session.userId);
-        if (user?.githubAccessToken) {
-            headers['Authorization'] = `token ${user.githubAccessToken}`;
-            console.log(
-                `Making authenticated GitHub API request for user ${user.username}.`
-            );
-            return axios.create({ baseURL: 'https://api.github.com', headers });
-        }
-    }
-
-    console.log('Making unauthenticated GitHub API request (fallback).');
-    return axios.create({ baseURL: 'https://api.github.com', headers });
-};
-
diff --git a/server/util/GithubApiHelper.js b/server/util/GithubApiHelper.js
new file mode 100644
index 0000000..9d30c6b
--- /dev/null
+++ b/server/util/GithubApiHelper.js
@@ -0,0 +1,29 @@
+const axios = require('axios');
+const User = require('../models/UserModel');
+
+/**
+ * Creates an authenticated or unauthenticated GitHub API client based on session
+ * @param {Object} session - Express session object containing userId
+ * @returns {Promise} Axios instance configured for GitHub API
+ */
+const createGithubApi = async (session) => {
+    const headers = { 'Accept': 'application/vnd.github.v3+json' };
+
+    if (session?.userId) {
+        try {
+            const user = await User.findById(session.userId);
+            if (user?.githubAccessToken) {
+                headers['Authorization'] = `token ${user.githubAccessToken}`;
+                console.log(`Making authenticated GitHub API request for user ${user.username}.`);
+                return axios.create({ baseURL: 'https://api.github.com', headers });
+            }
+        } catch (dbError) {
+            console.error("Error fetching user for authenticated API call:", dbError.message);
+        }
+    }
+
+    console.log('Making unauthenticated GitHub API request (fallback).');
+    return axios.create({ baseURL: 'https://api.github.com', headers });
+};
+
+module.exports = { createGithubApi };