6 changes: 6 additions & 0 deletions src/ai-crawler/.env
@@ -0,0 +1,6 @@
API_BASE_URL=http://localhost:3000
WEBHOOK_SECRET=your_secret_key_here

# Optional: proxy settings if needed
# HTTP_PROXY=http://proxy:port
# HTTPS_PROXY=https://proxy:port
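
Note on the commented proxy settings: `requests` (used throughout the crawler) picks up `HTTP_PROXY`/`HTTPS_PROXY` from the environment automatically, so once `load_dotenv()` has run, no extra wiring is needed. A minimal check, assuming the two variables above are uncommented (the httpbin URL is just an example target):

```python
import os
import requests
from dotenv import load_dotenv

load_dotenv()

# requests reads HTTP_PROXY / HTTPS_PROXY from the environment by default
# (trust_env=True), so setting them in .env is enough.
print("HTTP_PROXY =", os.getenv("HTTP_PROXY"))
response = requests.get("https://httpbin.org/ip", timeout=10)  # example target URL
print(response.json())
```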
204 changes: 204 additions & 0 deletions src/ai-crawler/crawler.py
@@ -0,0 +1,204 @@
import requests
import time
import os
from datetime import datetime
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

class ScamCrawler:
def __init__(self, api_base_url=None):
self.api_base_url = api_base_url or os.getenv('API_BASE_URL', 'http://localhost:3000')
print(f"✅ Crawler initialized with API: {self.api_base_url}")

def start_crawl(self, source_id):
"""Bắt đầu crawl và tạo log"""
print(f"📡 Starting crawl for source {source_id}...")

try:
response = requests.post(
f"{self.api_base_url}/api/ai/crawl-start",
json={"source_id": source_id},
timeout=10
)
response.raise_for_status()
data = response.json()

if data.get("success"):
print(f"✅ Crawl log created: {data['data']['crawl_log_id']}")
return data["data"]["crawl_log_id"]
else:
raise Exception(f"API returned error: {data.get('error')}")

except requests.exceptions.RequestException as e:
print(f"❌ Network error: {str(e)}")
raise
except Exception as e:
print(f"❌ Error: {str(e)}")
raise

def save_phone_number(self, phone_number, source_id, confidence_score, scam_type=None, raw_data=None):
"""Lưu số điện thoại đã crawl được"""
try:
response = requests.post(
f"{self.api_base_url}/api/ai/save-numbers",
json={
"phone_number": phone_number,
"source_id": source_id,
"confidence_score": confidence_score,
"scam_type": scam_type,
"raw_data": raw_data
},
timeout=10
)
response.raise_for_status()
return response.json()
except Exception as e:
print(f"❌ Error saving phone {phone_number}: {str(e)}")
return {"success": False, "error": str(e)}

def complete_crawl(self, crawl_log_id, status, numbers_found, numbers_added,
numbers_updated, crawl_duration, error_message=None):
"""Hoàn thành crawl và cập nhật log"""
try:
response = requests.post(
f"{self.api_base_url}/api/ai/crawl-complete",
json={
"crawl_log_id": crawl_log_id,
"status": status,
"numbers_found": numbers_found,
"numbers_added": numbers_added,
"numbers_updated": numbers_updated,
"crawl_duration": crawl_duration,
"error_message": error_message
},
timeout=10
)
response.raise_for_status()
return response.json()
except Exception as e:
print(f"❌ Error completing crawl: {str(e)}")
return {"success": False, "error": str(e)}

def crawl_source(self, source_id, source_url):
"""Main crawl function"""
print(f"\n{'='*60}")
print(f"🚀 Starting crawl for source {source_id}: {source_url}")
print(f"{'='*60}")

start_time = time.time()
crawl_log_id = None
numbers_found = 0
numbers_added = 0
numbers_updated = 0

try:
            # 1. Start the crawl
crawl_log_id = self.start_crawl(source_id)

# 2. Crawl website
print(f"🔍 Extracting phone numbers from {source_url}...")
phone_numbers = self.extract_phone_numbers(source_url)
numbers_found = len(phone_numbers)
print(f"📊 Found {numbers_found} phone numbers")

            # 3. Save each phone number
for i, phone_data in enumerate(phone_numbers, 1):
print(f"💾 Saving phone {i}/{numbers_found}: {phone_data['phone']}")

result = self.save_phone_number(
phone_number=phone_data["phone"],
source_id=source_id,
confidence_score=phone_data["confidence"],
scam_type=phone_data.get("scam_type"),
raw_data=phone_data.get("raw_data")
)

if result.get("success"):
if result["data"]["is_new"]:
numbers_added += 1
print(f" ✅ New number added")
else:
numbers_updated += 1
print(f" ♻️ Existing number updated")
else:
print(f" ❌ Failed to save")

            # 4. Complete the crawl
crawl_duration = int(time.time() - start_time)
self.complete_crawl(
crawl_log_id=crawl_log_id,
status="success",
numbers_found=numbers_found,
numbers_added=numbers_added,
numbers_updated=numbers_updated,
crawl_duration=crawl_duration
)

print(f"\n{'='*60}")
print(f"🎉 Crawl completed successfully!")
print(f"📊 Stats:")
print(f" - Found: {numbers_found}")
print(f" - Added: {numbers_added}")
print(f" - Updated: {numbers_updated}")
print(f" - Duration: {crawl_duration}s")
print(f"{'='*60}\n")

except Exception as e:
print(f"\n{'='*60}")
print(f"❌ Error during crawl: {str(e)}")
print(f"{'='*60}\n")

if crawl_log_id:
crawl_duration = int(time.time() - start_time)
self.complete_crawl(
crawl_log_id=crawl_log_id,
status="failed",
numbers_found=numbers_found,
numbers_added=numbers_added,
numbers_updated=numbers_updated,
crawl_duration=crawl_duration,
error_message=str(e)
)

def extract_phone_numbers(self, url):
"""
        Crawl the website and extract phone numbers
TODO: Implement actual web scraping logic
"""
        # Placeholder - returns mock data
print("⚠️ Using mock data (implement actual crawling)")

import random

mock_phones = [
f"09{random.randint(10000000, 99999999)}",
f"03{random.randint(10000000, 99999999)}",
f"07{random.randint(10000000, 99999999)}",
]

return [
{
"phone": phone,
"confidence": round(random.uniform(70, 99), 1),
"scam_type": random.choice(["loan_scam", "investment_fraud", "impersonation"]),
"raw_data": {"source_text": f"Mock data for {phone}"}
}
for phone in mock_phones
]

# Usage example
if __name__ == "__main__":
print("🤖 ScamCrawler Test")
print(f"⏰ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

crawler = ScamCrawler()

    # Test crawl with source_id = 1
crawler.crawl_source(
source_id=1,
source_url="https://example-forum.com/scam-reports"
)

print(f"\n⏰ Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
5 changes: 5 additions & 0 deletions src/ai-crawler/requirements.txt
@@ -0,0 +1,5 @@
requests==2.31.0
schedule==1.2.0
beautifulsoup4==4.12.2
selenium==4.15.2
python-dotenv==1.0.0
74 changes: 74 additions & 0 deletions src/ai-crawler/scheduler.py
@@ -0,0 +1,74 @@
import schedule
import time
import requests
from crawler import ScamCrawler

class CrawlScheduler:
def __init__(self, api_base_url="http://localhost:3000"):
self.api_base_url = api_base_url
self.crawler = ScamCrawler(api_base_url)

    def get_active_sources(self):
        """Fetch the list of currently active web sources"""
        response = requests.get(f"{self.api_base_url}/api/admin/web-sources", timeout=10)
        response.raise_for_status()
        data = response.json()

        if data.get("success"):
            # Keep only sources that are active
            return [s for s in data["data"] if s["is_active"]]
        return []

def crawl_all_sources(self):
"""Crawl tất cả sources"""
sources = self.get_active_sources()
print(f"Found {len(sources)} active sources")

for source in sources:
try:
self.crawler.crawl_source(
source_id=source["id"],
source_url=source["source_url"]
)
                time.sleep(5)  # Delay between requests
except Exception as e:
print(f"Error crawling source {source['id']}: {str(e)}")

def crawl_by_frequency(self, frequency):
"""Crawl sources theo tần suất"""
sources = self.get_active_sources()
sources_to_crawl = [s for s in sources if s["crawl_frequency"] == frequency]

print(f"Crawling {len(sources_to_crawl)} sources with frequency: {frequency}")

for source in sources_to_crawl:
try:
self.crawler.crawl_source(
source_id=source["id"],
source_url=source["source_url"]
)
time.sleep(5)
except Exception as e:
print(f"Error: {str(e)}")

def start(self):
"""Khởi động scheduler"""
print("Starting crawler scheduler...")

# Schedule hourly crawls
schedule.every().hour.do(lambda: self.crawl_by_frequency("hourly"))

# Schedule daily crawls (at 2 AM)
schedule.every().day.at("02:00").do(lambda: self.crawl_by_frequency("daily"))

# Schedule weekly crawls (Monday at 3 AM)
schedule.every().monday.at("03:00").do(lambda: self.crawl_by_frequency("weekly"))

print("Scheduler started. Running...")

while True:
schedule.run_pending()
time.sleep(60) # Check every minute

if __name__ == "__main__":
scheduler = CrawlScheduler()
scheduler.start()
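
For reference, the response shape that `get_active_sources()` assumes from `GET /api/admin/web-sources`, reconstructed from the field accesses above (example values are hypothetical), plus a one-off run that skips the blocking schedule loop:

```python
from scheduler import CrawlScheduler

# Assumed per-source shape, inferred from the code above:
example_source = {
    "id": 1,
    "source_url": "https://example-forum.com/scam-reports",
    "is_active": True,
    "crawl_frequency": "daily",  # matched against "hourly" / "daily" / "weekly"
}

# One-off run for testing, without starting the scheduler loop:
scheduler = CrawlScheduler()
scheduler.crawl_by_frequency("daily")
```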
68 changes: 68 additions & 0 deletions src/app/api/admin/ai-logs/route.ts
@@ -0,0 +1,68 @@
import { NextResponse } from 'next/server';
import pool from '@/app/lib/db';

export async function GET(request: Request) {
try {
const { searchParams } = new URL(request.url);
    const limit = parseInt(searchParams.get('limit') || '50', 10);
const source_id = searchParams.get('source_id');
const status = searchParams.get('status');

let query = `
SELECT
acl.*,
ws.source_name,
ws.source_url,
ws.priority
FROM ai_crawl_logs acl
LEFT JOIN web_sources ws ON acl.source_id = ws.id
WHERE 1=1
`;

const params: any[] = [];
let paramIndex = 1;

if (source_id) {
query += ` AND acl.source_id = $${paramIndex}`;
params.push(source_id);
paramIndex++;
}

if (status) {
query += ` AND acl.status = $${paramIndex}`;
params.push(status);
paramIndex++;
}

query += ` ORDER BY acl.started_at DESC LIMIT $${paramIndex}`;
params.push(limit);

const result = await pool.query(query, params);

const statsQuery = `
SELECT
COUNT(*) as total_crawls,
COUNT(*) FILTER (WHERE status = 'success') as successful,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
SUM(numbers_found) as total_numbers_found,
SUM(numbers_added) as total_numbers_added,
AVG(crawl_duration) as avg_duration
FROM ai_crawl_logs
WHERE started_at >= CURRENT_DATE - INTERVAL '30 days';
`;

const statsResult = await pool.query(statsQuery);

return NextResponse.json({
success: true,
data: result.rows,
stats: statsResult.rows[0]
});
} catch (error) {
console.error('Database error:', error);
return NextResponse.json(
{ success: false, error: 'Failed to fetch AI logs' },
{ status: 500 }
);
}
}
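
A quick way to exercise this endpoint from the Python side, matching the query params parsed above (host/port assume the crawler's `API_BASE_URL` default):

```python
import requests

API_BASE_URL = "http://localhost:3000"  # matches the crawler's .env default

response = requests.get(
    f"{API_BASE_URL}/api/admin/ai-logs",
    params={"limit": 20, "status": "failed"},  # source_id is also accepted
    timeout=10,
)
payload = response.json()
print(payload["stats"])  # 30-day aggregates from the stats query
for log in payload["data"]:
    print(log["source_name"], log["status"], log["crawl_duration"])
```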