/**
 * Phone Bill Parser Backend API
 * Extracts phone line details from AT&T PDF bills using pdf-parse + regex
 */
55
66import express , { Request , Response } from "express" ;
77import multer from "multer" ;
88import cors from "cors" ;
99import * as dotenv from "dotenv" ;
1010import { promises as fs } from "fs" ;
11- import { LlamaExtract } from "llama-cloud-services " ;
11+ import { PDFParse } from "pdf-parse " ;
1212import path from "path" ;
1313import { fileURLToPath } from "url" ;
1414
@@ -55,16 +55,6 @@ const frontendDistPath = path.join(__dirname, '../frontend/dist');
5555app . use ( express . static ( frontendDistPath ) ) ;
5656
5757// Types
58- interface PhoneLine {
59- number : string ;
60- user : string ;
61- total : number ;
62- }
63-
64- interface ExtractionResult {
65- service_activity_lines : PhoneLine [ ] ;
66- }
67-
6858interface BillData {
6959 total_amount : number ;
7060 line_count : number ;
@@ -76,41 +66,71 @@ interface BillData {
7666}
7767
/**
 * Convert a dollar figure captured from the bill (e.g. "1,234.56") to a number.
 * Thousands separators are stripped before parsing.
 */
function parseDollarAmount(raw: string): number {
  return parseFloat(raw.replace(/,/g, ""));
}

/**
 * Parse the plain text of an AT&T wireless bill into per-line amounts.
 *
 * Strategy:
 *   1. Pull full names from section headers — "Phone, XXX.XXX.XXXX\nFULL NAME"
 *      (the page 2 summary table truncates names like "NAVEEN KUMAR ...")
 *   2. Pull totals from the "Total for XXX.XXX.XXXX $XX.XX" lines
 *   3. Join on phone number; lines without a matching header are named "Unknown"
 *   4. Sanity check: when a "Total for Wireless $X" line is present, the sum of
 *      line totals must match it to within two cents. If that line is absent
 *      the check is skipped.
 *
 * @param text - Text extracted from the bill PDF.
 * @returns Line items in the order their totals appear, plus sum and count.
 * @throws Error if no per-line totals are found, or if the line totals do not
 *         add up to the wireless section total.
 */
function parseAttBillText(text: string): BillData {
  // Step 1: phone → full name
  // Matches "Phone, 214.957.3190\nKODUMURI VAMSHI" and "Wearable, 945.214.5965\nAPPLE WATCH"
  const headerRegex = /(?:Phone|Wearable),\s+(\d{3}\.\d{3}\.\d{4})\s*\n\s*([A-Z][A-Z ]+)/g;
  const nameByPhone = new Map<string, string>();
  for (const header of text.matchAll(headerRegex)) {
    // The name capture can end in a trailing space, hence the trim.
    nameByPhone.set(header[1], header[2].trim());
  }

  // Step 2: phone → total
  // Matches "Total for 214.957.3190 $76.58"
  const totalRegex = /Total for (\d{3}\.\d{3}\.\d{4})\s+\$([0-9,]+\.\d{2})/g;
  const totalByPhone = new Map<string, number>();
  for (const total of text.matchAll(totalRegex)) {
    totalByPhone.set(total[1], parseDollarAmount(total[2]));
  }

  if (totalByPhone.size === 0) {
    throw new Error(
      "No line totals found in the PDF. " +
      "Make sure this is an AT&T wireless bill — other bill formats are not supported yet."
    );
  }

  // Step 3: build line items, join name + total on phone number
  const lines = Array.from(totalByPhone, ([phone, amount]) => ({
    phone_number: phone,
    line_name: nameByPhone.get(phone) ?? "Unknown",
    amount_owed: amount,
  }));

  // Step 4: sanity check — line totals must sum to the wireless section total.
  // The sum is rounded to whole cents so float drift does not leak into the result.
  const lineSum = Math.round(lines.reduce((sum, l) => sum + l.amount_owed, 0) * 100) / 100;
  const wirelessMatch = text.match(/Total for Wireless\s+\$([0-9,]+\.\d{2})/);
  if (wirelessMatch) {
    const wirelessTotal = parseDollarAmount(wirelessMatch[1]);
    if (Math.abs(lineSum - wirelessTotal) > 0.02) {
      throw new Error(
        `Parse validation failed: line totals sum ($${lineSum.toFixed(2)}) ` +
        `does not match wireless total ($${wirelessTotal.toFixed(2)}). ` +
        "The bill may have an unsupported charge type — please verify manually."
      );
    }
  }

  return {
    total_amount: lineSum,
    line_count: lines.length,
    lines,
  };
}

/**
 * Extract phone bill data from an AT&T PDF using regex parsing.
 *
 * Reads the file into memory, extracts its text with pdf-parse, and parses
 * that text with `parseAttBillText`. The parser instance is destroyed in a
 * `finally` block so its resources are released whether or not extraction or
 * parsing throws.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns Per-line amounts, the summed total, and the number of lines.
 * @throws Error if the file cannot be read, the PDF text cannot be extracted,
 *         no line totals are found, or the totals fail validation.
 */
async function extractPhoneBill(filePath: string): Promise<BillData> {
  const buffer = await fs.readFile(filePath);
  const parser = new PDFParse({ data: buffer });
  try {
    const { text } = await parser.getText();
    return parseAttBillText(text);
  } finally {
    // Release the parser's resources; the original never did this.
    await parser.destroy();
  }
}
115135
116136// Health check endpoint
@@ -126,13 +146,6 @@ app.post("/api/extract", upload.single("file"), async (req: Request, res: Respon
126146 let uploadedFilePath : string | undefined ;
127147
128148 try {
129- // Validate API key
130- if ( ! process . env . LLAMA_CLOUD_API_KEY ) {
131- return res . status ( 500 ) . json ( {
132- error : "Server configuration error: LLAMA_CLOUD_API_KEY not set"
133- } ) ;
134- }
135-
136149 // Validate file upload
137150 if ( ! req . file ) {
138151 return res . status ( 400 ) . json ( {
0 commit comments