diff --git a/backend/package-lock.json b/backend/package-lock.json index c591b42..fb45a27 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -15,7 +15,8 @@ "dotenv": "^17.2.3", "express": "^5.1.0", "jszip": "^3.10.1", - "multer": "^2.0.2" + "multer": "^2.0.2", + "pdf-parse": "^2.4.5" }, "devDependencies": { "@ljharb/tsconfig": "^0.3.2", @@ -753,6 +754,190 @@ "node": ">= 0.4" } }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz", + "integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==", + "license": "MIT", + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.80", + "@napi-rs/canvas-darwin-arm64": "0.1.80", + "@napi-rs/canvas-darwin-x64": "0.1.80", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.80", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.80", + "@napi-rs/canvas-linux-arm64-musl": "0.1.80", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.80", + "@napi-rs/canvas-linux-x64-gnu": "0.1.80", + "@napi-rs/canvas-linux-x64-musl": "0.1.80", + "@napi-rs/canvas-win32-x64-msvc": "0.1.80" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.80.tgz", + "integrity": "sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.80.tgz", + "integrity": "sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.80.tgz", + "integrity": "sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.80.tgz", + "integrity": "sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.80.tgz", + "integrity": "sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.80.tgz", + "integrity": "sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.80.tgz", + "integrity": "sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.80.tgz", + "integrity": "sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.80.tgz", + "integrity": "sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.80.tgz", + "integrity": "sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -4722,6 +4907,38 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/pdf-parse": { + "version": "2.4.5", + "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz", + "integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==", + "license": "Apache-2.0", + "dependencies": { + "@napi-rs/canvas": "0.1.80", + "pdfjs-dist": "5.4.296" + }, + "bin": { + "pdf-parse": "bin/cli.mjs" + }, + "engines": { + "node": ">=20.16.0 <21 || >=22.3.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/mehmet-kozan" + } + }, + "node_modules/pdfjs-dist": { + "version": "5.4.296", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz", + "integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.16.0 || >=22.3.0" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.80" + } + }, "node_modules/picomatch": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", diff --git a/backend/package.json b/backend/package.json index a466423..4c60647 100644 --- a/backend/package.json +++ b/backend/package.json @@ -30,7 +30,8 @@ "dotenv": "^17.2.3", "express": "^5.1.0", "jszip": "^3.10.1", - "multer": "^2.0.2" + "multer": "^2.0.2", + "pdf-parse": "^2.4.5" }, "devDependencies": { "@ljharb/tsconfig": "^0.3.2", diff --git a/backend/src/controllers/ai.controller.ts b/backend/src/controllers/ai.controller.ts index 34f8ac0..32209b5 100644 --- a/backend/src/controllers/ai.controller.ts +++ b/backend/src/controllers/ai.controller.ts @@ -1,6 +1,7 @@ import type { Request, Response, Express } from "express"; import JSZip from "jszip"; import { llmService } from "../services/llm.service.js"; +import { PDFParse } from "pdf-parse"; import type { RewriteBulletPointRequest, GenerateCoverLetterRequest, @@ -69,7 +70,7 @@ const parseResumeFile = async (file: Express.Multer.File): Promise => { const buffer = file.buffer; if (mime.includes("pdf") || name.endsWith(".pdf")) { - return extractPdfText(buffer); + return await extractPdfText(buffer); } if (mime.includes("wordprocessingml") || name.endsWith(".docx")) { @@ -83,34 +84,21 @@ const parseResumeFile = async (file: Express.Multer.File): Promise => { return buffer.toString("utf8"); }; -const extractPdfText = (buffer: Buffer): string => { - const pdfString = buffer.toString("latin1"); - const matches = pdfString.match(/\(([^()\\]*(?:\\.[^()\\]*)*)\)/g); +const extractPdfText = async (buffer: Buffer): Promise => { + let parser: PDFParse | undefined; - if (!matches) { - return pdfString.replace(/[^\x20-\x7E\r\n]+/g, " ").replace(/\s+/g, " "); - } + try { + parser = new PDFParse({ data: buffer }); + const result = await parser.getText(); + const text = result.text || ""; - const cleaned = matches - .map((m) => m.slice(1, -1)) - .map((text) => - text - .replace(/\\([nrtbf()\\])/g, (_match, p1) => { - if (p1 === "n") return "\n"; - if (p1 === "r") return "\r"; - if (p1 === "t") return "\t"; - if (p1 === "b") return "\b"; - if (p1 === "f") return "\f"; - return p1; - }) - .replace(/\\(\d{1,3})/g, (_m, octal) => { - const code = parseInt(octal, 8); - return Number.isFinite(code) ? String.fromCharCode(code) : ""; - }), - ) - .join(" "); - - return cleaned.replace(/\s+/g, " "); + return text.replace(/\s+/g, " ").trim(); + } catch (error) { + console.error("Failed to extract PDF text:", error); + return ""; + } finally { + await parser?.destroy(); + } }; const extractDocBinaryText = (buffer: Buffer): string => {