Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 218 additions & 1 deletion backend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion backend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"dotenv": "^17.2.3",
"express": "^5.1.0",
"jszip": "^3.10.1",
"multer": "^2.0.2"
"multer": "^2.0.2",
"pdf-parse": "^2.4.5"
},
"devDependencies": {
"@ljharb/tsconfig": "^0.3.2",
Expand Down
42 changes: 15 additions & 27 deletions backend/src/controllers/ai.controller.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { Request, Response, Express } from "express";
import JSZip from "jszip";
import { llmService } from "../services/llm.service.js";
import { PDFParse } from "pdf-parse";
import type {
RewriteBulletPointRequest,
GenerateCoverLetterRequest,
Expand Down Expand Up @@ -69,7 +70,7 @@ const parseResumeFile = async (file: Express.Multer.File): Promise<string> => {
const buffer = file.buffer;

if (mime.includes("pdf") || name.endsWith(".pdf")) {
return extractPdfText(buffer);
return await extractPdfText(buffer);
}

if (mime.includes("wordprocessingml") || name.endsWith(".docx")) {
Expand All @@ -83,34 +84,21 @@ const parseResumeFile = async (file: Express.Multer.File): Promise<string> => {
return buffer.toString("utf8");
};

// NOTE(review): this span is a rendered diff without +/- markers, so two
// versions of extractPdfText are interleaved line by line and share the final
// closing `};`. The synchronous regex-based body appears to be the removed
// version and the async PDFParse-based body the added one — confirm against
// the repository before relying on that reading.
//
// Regex-based version: decodes the buffer as latin1, collects every
// parenthesised run (backslash escapes allowed inside), strips the enclosing
// parentheses, unescapes the sequences handled below, joins the pieces with
// single spaces and collapses whitespace runs.
const extractPdfText = (buffer: Buffer): string => {
const pdfString = buffer.toString("latin1");
const matches = pdfString.match(/\(([^()\\]*(?:\\.[^()\\]*)*)\)/g);
// PDFParse-based version: resolves to the parser's text with whitespace runs
// collapsed to single spaces and the ends trimmed. Any error thrown inside the
// `try` is logged and "" is returned instead; the `finally` block awaits
// destroy() on the parser if one was constructed.
const extractPdfText = async (buffer: Buffer): Promise<string> => {
let parser: PDFParse | undefined;

// (regex-based version) No parenthesised run found: replace every run of
// characters outside printable ASCII / CR / LF with a space, then collapse
// whitespace.
if (!matches) {
return pdfString.replace(/[^\x20-\x7E\r\n]+/g, " ").replace(/\s+/g, " ");
}
// (PDFParse-based version) A missing or empty `text` falls back to "".
try {
parser = new PDFParse({ data: buffer });
const result = await parser.getText();
const text = result.text || "";

// (regex-based version) slice(1, -1) drops the surrounding parentheses; the
// first replace maps \n \r \t \b \f to their control characters and unescapes
// \( \) \\; the second turns a backslash followed by 1-3 digits into the
// character with that octal code.
const cleaned = matches
.map((m) => m.slice(1, -1))
.map((text) =>
text
.replace(/\\([nrtbf()\\])/g, (_match, p1) => {
if (p1 === "n") return "\n";
if (p1 === "r") return "\r";
if (p1 === "t") return "\t";
if (p1 === "b") return "\b";
if (p1 === "f") return "\f";
return p1;
})
.replace(/\\(\d{1,3})/g, (_m, octal) => {
const code = parseInt(octal, 8);
return Number.isFinite(code) ? String.fromCharCode(code) : "";
}),
)
.join(" ");

// The first return belongs to the regex-based version (no trim); the second
// belongs to the PDFParse-based version (trimmed).
return cleaned.replace(/\s+/g, " ");
return text.replace(/\s+/g, " ").trim();
} catch (error) {
console.error("Failed to extract PDF text:", error);
return "";
} finally {
// Optional chaining: parser is still undefined if the constructor threw.
await parser?.destroy();
}
};

const extractDocBinaryText = (buffer: Buffer): string => {
Expand Down