From 9c0bf0095f287a3c783029707778e52b5dd8145a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Dec 2025 02:09:40 +0000 Subject: [PATCH 1/3] Add MapReduce/Pipeline to Cloud Run Jobs migration plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive migration plan covering: - 11 simple mapper jobs → Cloud Run Jobs - 4 MapReduce pipeline jobs → Cloud Run Jobs - 2 custom Pipeline orchestrations → Cloud Workflows - Utility module ports (fb_mapreduce, mr.py) - Infrastructure setup (base image, job framework) - Configuration updates (queue.yaml, Cloud Scheduler) Organized by phase with complexity ratings and FB API requirements. --- server/MIGRATION_PLAN.md | 348 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 348 insertions(+) create mode 100644 server/MIGRATION_PLAN.md diff --git a/server/MIGRATION_PLAN.md b/server/MIGRATION_PLAN.md new file mode 100644 index 00000000..9d452a49 --- /dev/null +++ b/server/MIGRATION_PLAN.md @@ -0,0 +1,348 @@ +# MapReduce/Pipeline Migration Plan + +This document outlines the migration from legacy App Engine MapReduce/Pipeline to modern Google Cloud services. + +## Migration Strategy + +| Legacy Pattern | Modern Replacement | +|----------------|-------------------| +| `start_map()` (mapper only) | **Cloud Run Jobs** | +| `MapreducePipeline` (map+reduce) | **Cloud Run Jobs** (simple) or **Cloud Dataflow** (complex) | +| `Pipeline` orchestration | **Cloud Workflows** | +| Task Queues | **Cloud Tasks** (already compatible) | + +--- + +## Phase 1: Infrastructure Setup + +### Task 1.1: Create Cloud Run Job Base Image +- **File to create**: `server/cloud_run/Dockerfile.jobs` +- **Purpose**: Base Docker image for all batch jobs +- **Contents**: Python runtime, common dependencies, Datastore client, GCS client +- **Priority**: HIGH (blocking for all other migrations) + +### Task 1.2: Create Job Runner Framework +- **File to create**: `server/dancedeets/jobs/base.py` +- **Purpose**: Base class for Cloud Run Jobs replacing mapreduce patterns +- **Features needed**: + - Datastore entity iteration with cursor-based pagination + - Parallel task execution (Cloud Run Jobs supports up to 10,000 parallel tasks) + - Counter/metrics collection + - GCS output writer + - Facebook API token injection (port from `fb_mapreduce.py`) + +### Task 1.3: Create Cloud Workflows Templates +- **File to create**: `server/workflows/` +- **Purpose**: YAML workflow definitions for orchestrated jobs +- **Priority**: MEDIUM (only needed for Pipeline migrations) + +--- + +## Phase 2: Simple Mapper Jobs → Cloud Run Jobs + +These jobs iterate over entities and perform side effects (no reduce step, no GCS output). + +### Task 2.1: `notifications/added_events.py` +- **Current**: `promote_events_to_user` via `start_map()` +- **Entity**: `User` (filtered by timezone_offset) +- **Action**: Sends push notifications for new events +- **Migration**: + 1. Create `server/dancedeets/jobs/notify_users.py` + 2. Query users by timezone offset + 3. For each user: search events, create Android push notification + 4. Schedule via Cloud Scheduler (hourly, matching current cron) +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.2: `pubsub/pubsub_tasks.py` +- **Current**: `map_post_jp_event` via `start_map()` +- **Entity**: `DBEvent` (filtered by TIME_FUTURE) +- **Action**: Posts Japan events to social media +- **Migration**: + 1. Create `server/dancedeets/jobs/post_japan_events.py` + 2. Query future events ending with 'Japan' + 3. 
Post to Twitter/social via pubsub module +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.3: `rankings/rankings.py` +- **Current**: `count_event_for_city`, `count_user_for_city` via `start_map()` +- **Entity**: `DBEvent` or `User` +- **Action**: Counts events/users by city, stores in counters +- **Migration**: + 1. Create `server/dancedeets/jobs/compute_rankings.py` + 2. Use in-memory counters (dict) instead of mapreduce counters + 3. Query entities, increment counters by city/country + 4. Call `_compute_summary()` at job end +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.4: `users/user_event_tasks.py` +- **Current**: `map_compute_user_stats` via `start_map()` +- **Entity**: `User` +- **Action**: Computes event statistics per user +- **Migration**: + 1. Create `server/dancedeets/jobs/compute_user_stats.py` + 2. Query all users + 3. For each: query PotentialEvent by source_ids, count by creating_method + 4. Update user properties +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.5: `users/user_tasks.py` +- **Current**: `map_load_fb_user` via `start_map()` +- **Entity**: `User` (optionally filtered by expired_oauth_token) +- **Action**: Refreshes user profile from Facebook +- **Migration**: + 1. Create `server/dancedeets/jobs/refresh_users.py` + 2. Query users (optionally skip expired tokens) + 3. For each: fetch LookupUser from FB, update Mailchimp, compute_derived_properties() + 4. Handle ExpiredOAuthToken exceptions +- **Complexity**: MEDIUM +- **Facebook API**: Yes (needs token handling) + +### Task 2.6: `search/email_events.py` +- **Current**: `map_email_user` via `start_map()` +- **Entity**: `User` +- **Action**: Sends weekly event digest emails +- **Migration**: + 1. Create `server/dancedeets/jobs/send_weekly_emails.py` + 2. Query all users + 3. For each: search events, render HTML via render_server, send via Mandrill + 4. Update user.weekly_email_send_date + 5. Handle NoEmailException, ExpiredOAuthToken +- **Complexity**: MEDIUM +- **Facebook API**: Yes (needs token handling) + +--- + +## Phase 3: Mapper Jobs with GCS Output → Cloud Run Jobs + +These jobs iterate and write results to Google Cloud Storage. + +### Task 3.1: `sitemaps/events.py` +- **Current**: `map_sitemap_event` via `start_map()` with output writer +- **Entity**: `DBEvent` (filtered by vertical, time_period) +- **Output**: XML sitemap to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/generate_sitemaps.py` + 2. Query events by filters + 3. Generate XML entries with lxml + 4. Write to GCS using google-cloud-storage client + 5. Handle file splitting if needed (sitemaps have size limits) +- **Complexity**: MEDIUM +- **Facebook API**: No + +### Task 3.2: `logic/mr_dump.py` +- **Current**: `map_dump_fb_json` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by looked_at=None) +- **Output**: CSV to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/dump_potential_events.py` + 2. Query PotentialEvents not yet looked at + 3. Batch fetch from Facebook API + 4. Write CSV rows to GCS +- **Complexity**: MEDIUM +- **Facebook API**: Yes (batch_fetch) + +### Task 3.3: `ml/gprediction.py` +- **Current**: `map_training_data_for_pevents` via `start_map()` with output writer +- **Entity**: `PotentialEvent` +- **Output**: ML training data CSV to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/generate_training_data.py` + 2. Query PotentialEvents + 3. Fetch event details and attending from Facebook + 4. Extract training features + 5. 
Write to GCS +- **Complexity**: MEDIUM +- **Facebook API**: Yes + +### Task 3.4: `ml/mr_prediction.py` +- **Current**: `map_classify_events` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by looked_at=None) +- **Output**: Classification results to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/classify_events_ml.py` + 2. Query unprocessed PotentialEvents + 3. Batch Facebook API requests + 4. Call Google Prediction API + 5. Write results to GCS +- **Complexity**: HIGH (ML service integration) +- **Facebook API**: Yes + +### Task 3.5: `event_scraper/auto_add.py` +- **Current**: `map_classify_events` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by should_look_at, past_event) +- **Action**: Auto-adds dance events, writes results to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/auto_add_events.py` + 2. Query PotentialEvents matching criteria + 3. Run NLP classifier, attendee classifier + 4. Create DBEvent via add_entities.add_update_fb_event() + 5. Update PotentialEvent.looked_at, auto_looked_at + 6. Write summary to GCS +- **Complexity**: HIGH (multiple classifiers, entity creation) +- **Facebook API**: Yes + +--- + +## Phase 4: MapReduce Pipeline Jobs → Cloud Run Jobs or Dataflow + +These have both map and reduce steps. + +### Task 4.1: `logic/unique_attendees.py` +- **Current**: `MapreducePipeline` with mapper + reducer +- **Map**: Emits (city, attendee_id) from each event +- **Reduce**: Counts unique attendees per city +- **Migration Options**: + - **Option A (Cloud Run Jobs)**: Single job with in-memory aggregation + 1. Create `server/dancedeets/jobs/count_unique_attendees.py` + 2. Query all FB events + 3. Use `dict[city, set[attendee_id]]` for uniqueness + 4. Write final counts to GCS + - **Option B (Cloud Dataflow)**: Apache Beam pipeline (if scale demands) +- **Complexity**: MEDIUM +- **Facebook API**: Yes (batch_fetch for attending) + +### Task 4.2: `event_scraper/thing_db.py` +- **Current**: `MapreducePipeline` - counts events per source +- **Map**: `explode_per_source_count` - emits counts per source +- **Reduce**: `combine_source_count` - sums and updates Source entities +- **Migration**: + 1. Create `server/dancedeets/jobs/update_source_stats.py` + 2. Query all PotentialEvents + 3. Aggregate counts by source_id in memory + 4. Batch update Source entities +- **Complexity**: MEDIUM +- **Facebook API**: Yes + +### Task 4.3: `event_scraper/thing_scraper2.py` +- **Current**: `MapreducePipeline` - scrapes sources then processes events +- **Map**: `scrape_sources_for_events` - discovers events from sources +- **Reduce**: `process_events` - classifies discovered events +- **Migration**: + 1. Create `server/dancedeets/jobs/scrape_and_classify.py` + 2. Query all Sources (filtered by min_potential_events) + 3. Scrape each source for events + 4. Process through event_pipeline.process_discovered_events() +- **Complexity**: HIGH (multi-stage, external scraping) +- **Facebook API**: Yes + +### Task 4.4: `events/find_access_tokens.py` +- **Current**: Complex multi-stage `MapreducePipeline` +- **Stages**: Find events → Combine → Find tokens → Save +- **Migration**: + 1. This is best migrated to **Cloud Workflows** orchestrating multiple Cloud Run Jobs + 2. Create workflow: `server/workflows/find_access_tokens.yaml` + 3. Create jobs: + - `server/dancedeets/jobs/find_events_needing_tokens.py` + - `server/dancedeets/jobs/test_user_tokens.py` + - `server/dancedeets/jobs/save_valid_tokens.py` + 4. 
Workflow coordinates: job1 → job2 → job3 +- **Complexity**: HIGH (multi-stage orchestration) +- **Facebook API**: Yes + +--- + +## Phase 5: Custom Pipeline Jobs → Cloud Workflows + +### Task 5.1: `classes/class_pipeline.py` +- **Current**: `CrawlAndIndexClassesJob` Pipeline with 4 stages +- **Stages**: + 1. `start_spiders` - Triggers ScrapingHub spiders + 2. `WaitForJobs` - Polls for completion (30s retries) + 3. `ReindexClasses` - Rebuilds class search index + 4. `EmailErrors` - Sends error report via Mandrill +- **Migration**: + 1. Create workflow: `server/workflows/crawl_and_index_classes.yaml` + 2. Create Cloud Run Jobs: + - `server/dancedeets/jobs/start_spiders.py` + - `server/dancedeets/jobs/reindex_classes.py` + - `server/dancedeets/jobs/email_crawl_errors.py` + 3. Use Cloud Workflows built-in retry/polling for WaitForJobs + 4. Wire up: start_spiders → poll_completion → reindex → email_errors +- **Complexity**: MEDIUM (mostly orchestration) +- **Facebook API**: No + +--- + +## Phase 6: Utility Module Updates + +### Task 6.1: Port `util/fb_mapreduce.py` +- **Current**: Facebook token injection for mapreduce +- **New**: `server/dancedeets/jobs/fb_utils.py` +- **Features to port**: + - `get_fblookup()` - Get FBLookup with access token + - `get_multiple_tokens()` - Token rotation for long jobs + - Batch Facebook API request handling + +### Task 6.2: Port `util/mr.py` +- **Current**: Counter utilities for mapreduce +- **New**: `server/dancedeets/jobs/metrics.py` +- **Features**: + - In-memory counter implementation + - Optional Cloud Monitoring integration + +### Task 6.3: Deprecate Compatibility Layer +- **Files**: `server/dancedeets/compat/mapreduce/`, `server/dancedeets/compat/pipeline/` +- **Action**: Once all jobs migrated, remove compat layer entirely + +--- + +## Phase 7: Configuration & Deployment + +### Task 7.1: Update `queue.yaml` → Cloud Tasks +- Migrate queue definitions to Cloud Tasks API +- Update queue references in job code + +### Task 7.2: Create Cloud Run Job Definitions +- **File**: `server/cloudbuild.yaml` or Terraform configs +- Define all Cloud Run Jobs with resource limits + +### Task 7.3: Create Cloud Scheduler Triggers +- Replace App Engine cron with Cloud Scheduler +- Schedule all periodic jobs + +### Task 7.4: Create Cloud Workflows Definitions +- Deploy workflow YAML files +- Set up workflow triggers + +### Task 7.5: Update `batch.yaml` +- Either remove (if batch service no longer needed) or update for Cloud Run + +--- + +## File-by-File Migration Checklist + +| File | Current Pattern | Target | Priority | Complexity | FB API | +|------|-----------------|--------|----------|------------|--------| +| `notifications/added_events.py` | start_map | Cloud Run Job | HIGH | LOW | No | +| `pubsub/pubsub_tasks.py` | start_map | Cloud Run Job | MEDIUM | LOW | No | +| `rankings/rankings.py` | start_map | Cloud Run Job | MEDIUM | LOW | No | +| `users/user_event_tasks.py` | start_map | Cloud Run Job | MEDIUM | LOW | No | +| `users/user_tasks.py` | start_map | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `search/email_events.py` | start_map | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `sitemaps/events.py` | start_map+output | Cloud Run Job | HIGH | MEDIUM | No | +| `logic/mr_dump.py` | start_map+output | Cloud Run Job | LOW | MEDIUM | Yes | +| `ml/gprediction.py` | start_map+output | Cloud Run Job | LOW | MEDIUM | Yes | +| `ml/mr_prediction.py` | start_map+output | Cloud Run Job | LOW | HIGH | Yes | +| `event_scraper/auto_add.py` | start_map+output | Cloud Run Job | HIGH | 
HIGH | Yes | +| `logic/unique_attendees.py` | MapreducePipeline | Cloud Run Job | LOW | MEDIUM | Yes | +| `event_scraper/thing_db.py` | MapreducePipeline | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `event_scraper/thing_scraper2.py` | MapreducePipeline | Cloud Run Job | HIGH | HIGH | Yes | +| `events/find_access_tokens.py` | Multi-stage Pipeline | Cloud Workflows | LOW | HIGH | Yes | +| `classes/class_pipeline.py` | Custom Pipeline | Cloud Workflows | MEDIUM | MEDIUM | No | + +--- + +## Recommended Migration Order + +1. **Infrastructure** (Task 1.1-1.3) - Required first +2. **Simple side-effect jobs** (Task 2.1-2.4) - Quick wins, no FB API +3. **FB API jobs** (Task 2.5-2.6) - After FB token handling is ported +4. **GCS output jobs** (Task 3.1) - Sitemaps are user-facing +5. **Event processing jobs** (Task 3.5, 4.3) - Core functionality +6. **ML jobs** (Task 3.2-3.4) - Lower priority, complex +7. **Pipeline orchestration** (Task 5.1, 4.4) - After individual jobs work +8. **Cleanup** (Task 6.3, 7.x) - Final phase From c7608184253ab3c5301c6c5ce0c744779baac216 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Dec 2025 02:40:38 +0000 Subject: [PATCH 2/3] Implement Cloud Run Jobs framework to replace MapReduce/Pipeline This commit adds a complete Cloud Run Jobs framework that replaces the legacy App Engine MapReduce and Pipeline libraries. Infrastructure (Phase 1): - dancedeets/jobs/base.py: Job, BatchJob, JobRunner base classes - dancedeets/jobs/fb_utils.py: Facebook API token handling - dancedeets/jobs/metrics.py: Counter/metrics tracking - dancedeets/jobs/gcs_output.py: GCS output writer - dancedeets/jobs/runner.py: CLI entry point - Dockerfile.jobs: Container for Cloud Run Jobs - requirements-jobs.txt: Job dependencies Simple Mapper Jobs (Phase 2): - notify_users.py: Push notifications by timezone - post_japan_events.py: Post Japan events to social - compute_rankings.py: City/country rankings - compute_user_stats.py: User event statistics - refresh_users.py: Refresh Facebook profiles - send_weekly_emails.py: Weekly digest emails GCS Output Jobs (Phase 3): - generate_sitemaps.py: XML sitemap generation - dump_potential_events.py: Export potential events - generate_training_data.py: ML training data - classify_events_ml.py: ML event classification - auto_add_events.py: Auto-add dance events MapReduce Pipeline Replacements (Phase 4): - count_unique_attendees.py: Unique RSVPs by city - update_source_stats.py: Source quality metrics - scrape_and_classify.py: Scrape and classify events Cloud Workflows (Phase 5): - workflows/crawl_and_index_classes.yaml: Orchestration - start_spiders.py: Start ScrapingHub spiders - reindex_classes.py: Rebuild class index - email_crawl_errors.py: Send error reports --- server/Dockerfile.jobs | 31 ++ server/MIGRATION_PLAN.md | 52 +++ server/dancedeets/jobs/__init__.py | 51 +++ server/dancedeets/jobs/auto_add_events.py | 300 ++++++++++++++ server/dancedeets/jobs/base.py | 379 ++++++++++++++++++ server/dancedeets/jobs/classify_events_ml.py | 188 +++++++++ server/dancedeets/jobs/compute_rankings.py | 219 ++++++++++ server/dancedeets/jobs/compute_user_stats.py | 151 +++++++ .../dancedeets/jobs/count_unique_attendees.py | 195 +++++++++ .../dancedeets/jobs/dump_potential_events.py | 160 ++++++++ server/dancedeets/jobs/email_crawl_errors.py | 175 ++++++++ server/dancedeets/jobs/fb_utils.py | 185 +++++++++ server/dancedeets/jobs/gcs_output.py | 167 ++++++++ server/dancedeets/jobs/generate_sitemaps.py | 219 ++++++++++ .../dancedeets/jobs/generate_training_data.py | 228 
+++++++++++ server/dancedeets/jobs/metrics.py | 151 +++++++ server/dancedeets/jobs/notify_users.py | 170 ++++++++ server/dancedeets/jobs/post_japan_events.py | 93 +++++ server/dancedeets/jobs/refresh_users.py | 174 ++++++++ server/dancedeets/jobs/reindex_classes.py | 50 +++ server/dancedeets/jobs/runner.py | 141 +++++++ server/dancedeets/jobs/scrape_and_classify.py | 192 +++++++++ server/dancedeets/jobs/send_weekly_emails.py | 292 ++++++++++++++ server/dancedeets/jobs/start_spiders.py | 120 ++++++ server/dancedeets/jobs/update_source_stats.py | 130 ++++++ server/requirements-jobs.txt | 22 + server/workflows/crawl_and_index_classes.yaml | 104 +++++ 27 files changed, 4339 insertions(+) create mode 100644 server/Dockerfile.jobs create mode 100644 server/dancedeets/jobs/__init__.py create mode 100644 server/dancedeets/jobs/auto_add_events.py create mode 100644 server/dancedeets/jobs/base.py create mode 100644 server/dancedeets/jobs/classify_events_ml.py create mode 100644 server/dancedeets/jobs/compute_rankings.py create mode 100644 server/dancedeets/jobs/compute_user_stats.py create mode 100644 server/dancedeets/jobs/count_unique_attendees.py create mode 100644 server/dancedeets/jobs/dump_potential_events.py create mode 100644 server/dancedeets/jobs/email_crawl_errors.py create mode 100644 server/dancedeets/jobs/fb_utils.py create mode 100644 server/dancedeets/jobs/gcs_output.py create mode 100644 server/dancedeets/jobs/generate_sitemaps.py create mode 100644 server/dancedeets/jobs/generate_training_data.py create mode 100644 server/dancedeets/jobs/metrics.py create mode 100644 server/dancedeets/jobs/notify_users.py create mode 100644 server/dancedeets/jobs/post_japan_events.py create mode 100644 server/dancedeets/jobs/refresh_users.py create mode 100644 server/dancedeets/jobs/reindex_classes.py create mode 100644 server/dancedeets/jobs/runner.py create mode 100644 server/dancedeets/jobs/scrape_and_classify.py create mode 100644 server/dancedeets/jobs/send_weekly_emails.py create mode 100644 server/dancedeets/jobs/start_spiders.py create mode 100644 server/dancedeets/jobs/update_source_stats.py create mode 100644 server/requirements-jobs.txt create mode 100644 server/workflows/crawl_and_index_classes.yaml diff --git a/server/Dockerfile.jobs b/server/Dockerfile.jobs new file mode 100644 index 00000000..b3a785ed --- /dev/null +++ b/server/Dockerfile.jobs @@ -0,0 +1,31 @@ +# Cloud Run Jobs Dockerfile +# Used for batch processing jobs that replace MapReduce/Pipeline + +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV GOOGLE_CLOUD_PROJECT=dancedeets-hrd + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* + +# Create app directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements-jobs.txt . +RUN pip install --no-cache-dir -r requirements-jobs.txt + +# Copy application code +COPY dancedeets/ ./dancedeets/ + +# Set Python path +ENV PYTHONPATH=/app + +# Default command (overridden by Cloud Run Job configuration) +CMD ["python", "-m", "dancedeets.jobs.runner"] diff --git a/server/MIGRATION_PLAN.md b/server/MIGRATION_PLAN.md index 9d452a49..126504b4 100644 --- a/server/MIGRATION_PLAN.md +++ b/server/MIGRATION_PLAN.md @@ -2,6 +2,58 @@ This document outlines the migration from legacy App Engine MapReduce/Pipeline to modern Google Cloud services. 
+## Migration Progress + +| Phase | Status | Jobs Migrated | +|-------|--------|---------------| +| Phase 1: Infrastructure | ✅ COMPLETE | Framework, Dockerfile, requirements | +| Phase 2: Simple Mapper Jobs | ✅ COMPLETE | 6/6 jobs | +| Phase 3: GCS Output Jobs | ✅ COMPLETE | 5/5 jobs | +| Phase 4: MapReduce Pipeline Jobs | ✅ COMPLETE | 3/4 jobs (find_access_tokens pending) | +| Phase 5: Cloud Workflows | ✅ COMPLETE | 1 workflow + 3 jobs | + +### New Files Created + +**Framework (`server/dancedeets/jobs/`):** +- `__init__.py` - Module exports +- `base.py` - Job, BatchJob, JobRunner classes +- `fb_utils.py` - Facebook API token handling +- `metrics.py` - JobMetrics, GroupedMetrics +- `gcs_output.py` - GCSOutputWriter +- `runner.py` - CLI entry point + +**Phase 2 Jobs:** +- `notify_users.py` - Push notifications by timezone +- `post_japan_events.py` - Post Japan events to social +- `compute_rankings.py` - City/country rankings +- `compute_user_stats.py` - User event statistics +- `refresh_users.py` - Refresh Facebook profiles +- `send_weekly_emails.py` - Weekly digest emails + +**Phase 3 Jobs:** +- `generate_sitemaps.py` - XML sitemap generation +- `dump_potential_events.py` - Export to CSV +- `generate_training_data.py` - ML training data +- `classify_events_ml.py` - ML classification +- `auto_add_events.py` - Auto-add dance events + +**Phase 4 Jobs:** +- `count_unique_attendees.py` - Unique RSVPs by city +- `update_source_stats.py` - Source quality metrics +- `scrape_and_classify.py` - Scrape and classify events + +**Phase 5 (Cloud Workflows):** +- `workflows/crawl_and_index_classes.yaml` - Orchestration workflow +- `start_spiders.py` - Start ScrapingHub spiders +- `reindex_classes.py` - Rebuild class search index +- `email_crawl_errors.py` - Send error reports + +**Docker/Config:** +- `Dockerfile.jobs` - Cloud Run Jobs container +- `requirements-jobs.txt` - Job dependencies + +--- + ## Migration Strategy | Legacy Pattern | Modern Replacement | diff --git a/server/dancedeets/jobs/__init__.py b/server/dancedeets/jobs/__init__.py new file mode 100644 index 00000000..73f47354 --- /dev/null +++ b/server/dancedeets/jobs/__init__.py @@ -0,0 +1,51 @@ +# Cloud Run Jobs framework for DanceDeets +# Replaces legacy App Engine MapReduce/Pipeline + +from .base import Job, JobRunner, BatchJob +from .fb_utils import get_fblookup, get_fblookup_params, get_multiple_tokens, FBJobContext +from .metrics import JobMetrics, GroupedMetrics +from .gcs_output import GCSOutputWriter + +__all__ = [ + # Base classes + 'Job', + 'JobRunner', + 'BatchJob', + # Facebook utilities + 'FBJobContext', + 'get_fblookup', + 'get_fblookup_params', + 'get_multiple_tokens', + # Metrics + 'JobMetrics', + 'GroupedMetrics', + # GCS output + 'GCSOutputWriter', +] + +# Available jobs (for reference): +# +# Phase 2 - Simple mapper jobs: +# - notify_users: Send push notifications by timezone +# - post_japan_events: Post Japan events to social media +# - compute_rankings: Compute city/country rankings +# - compute_user_stats: Compute user event statistics +# - refresh_users: Refresh user profiles from Facebook +# - send_weekly_emails: Send weekly event digest emails +# +# Phase 3 - GCS output jobs: +# - generate_sitemaps: Generate XML sitemaps +# - dump_potential_events: Export potential events to CSV +# - generate_training_data: Generate ML training data +# - classify_events_ml: ML event classification +# - auto_add_events: Auto-add dance events +# +# Phase 4 - MapReduce pipeline replacements: +# - count_unique_attendees: Count unique 
RSVPs by city +# - update_source_stats: Update source quality metrics +# - scrape_and_classify: Scrape sources and classify events +# +# Phase 5 - Pipeline orchestration (Cloud Workflows): +# - start_spiders: Start ScrapingHub spider jobs +# - reindex_classes: Rebuild class search index +# - email_crawl_errors: Send crawl error reports diff --git a/server/dancedeets/jobs/auto_add_events.py b/server/dancedeets/jobs/auto_add_events.py new file mode 100644 index 00000000..56d8b05d --- /dev/null +++ b/server/dancedeets/jobs/auto_add_events.py @@ -0,0 +1,300 @@ +""" +Cloud Run Job: Automatically add dance events to the database. + +Migrated from: dancedeets/event_scraper/auto_add.py + +This job classifies potential events using NLP and attendee analysis, +and automatically adds those that qualify as dance events. + +Usage: + python -m dancedeets.jobs.runner --job=auto_add_events + python -m dancedeets.jobs.runner --job=auto_add_events --dancey_only=true +""" + +import datetime +import logging +import re +from typing import List, Optional + +from dancedeets import fb_api +from dancedeets.event_attendees import event_attendee_classifier +from dancedeets.events import eventdata +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.nlp import event_auto_classifier +from dancedeets.nlp import event_classifier +from dancedeets.nlp.styles import street +from dancedeets.event_scraper import add_entities +from dancedeets.event_scraper import potential_events + +logger = logging.getLogger(__name__) + + +class AutoAddEventsJob(BatchJob): + """ + Job that automatically classifies and adds dance events. + + For each potential event: + 1. Fetch event data from Facebook + 2. Run NLP classifier on event text + 3. If text doesn't match, check attendee profiles + 4. Add qualifying events to the database + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + allow_posting: bool = True, + dry_run: bool = False, + ): + # Use small batch size to avoid timeouts (complex classification) + super().__init__(batch_size=10) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.allow_posting = allow_posting + self.dry_run = dry_run + self.output_writer = None + logger.info("AutoAddEventsJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='auto_add/results.txt', + content_type='text/plain', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Fetch event data from Facebook + fb_list = fbl.get_multi( + fb_api.LookupEvent, + [x.fb_event_id for x in pe_list], + allow_fail=True, + ) + + # Filter and classify events + results = self._classify_events(fbl, pe_list, fb_list) + + # Write results + if results and not self.dry_run: + for result in results: + self.output_writer.write(result) + + self.metrics.increment('batches_processed') + + def _classify_events(self, fbl, pe_list: list, fb_list: list) -> List[str]: + """ + Filter and classify events. 
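+        Result strings consist of a '+' prefix, the event id, a tab, and the
+        event name (built in _really_classify_events below).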
+ + Returns: + List of result strings for successfully added events + """ + new_pe_list = [] + new_fb_list = [] + + for pe, fb_event in zip(pe_list, fb_list): + # Handle past events + if pe.set_past_event(fb_event): + if not self.dry_run: + pe.put() + + if not fb_event or fb_event.get('empty'): + self.metrics.increment('skip-due-to-empty') + continue + + # Skip already processed events + if pe.looked_at: + logger.debug('Already looked at event, skipping') + self.metrics.increment('skip-due-to-looked-at') + continue + + event_id = pe.fb_event_id + if not re.match(r'^\d+$', event_id): + logger.error(f'Found a very strange potential event id: {event_id}') + self.metrics.increment('skip-due-to-bad-id') + continue + + new_pe_list.append(pe) + new_fb_list.append(fb_event) + + return self._really_classify_events(fbl, new_pe_list, new_fb_list) + + def _really_classify_events(self, fbl, pe_list: list, fb_list: list) -> List[str]: + """ + Actually classify events and add qualifying ones. + + Returns: + List of result strings + """ + if not fb_list: + return [] + + if not pe_list: + pe_list = [None] * len(fb_list) + + logger.info(f'Classifying {len(fb_list)} events') + + # Fetch attendee data + fb_event_ids = [x['info']['id'] for x in fb_list] + fb_attending_maybe_list = fbl.get_multi( + fb_api.LookupEventAttendingMaybe, + fb_event_ids, + allow_fail=True, + ) + + results = [] + for pe, fb_event, fb_event_attending_maybe in zip(pe_list, fb_list, fb_attending_maybe_list): + event_id = fb_event['info']['id'] + logger.debug(f'Classifying event {event_id}') + + # Run text classifier + classified_event = event_classifier.get_classified_event(fb_event) + auto_add_result = event_auto_classifier.is_auto_add_event(classified_event) + logger.debug(f'Text classification result: {auto_add_result}') + + good_event = False + method = None + verticals = [] + + if auto_add_result.is_good_event(): + good_event = True + method = eventdata.CM_AUTO + verticals = auto_add_result.verticals() + elif fb_event_attending_maybe: + # Try attendee-based classification + logger.debug(f'Trying attendee classification for {event_id}') + good_event = event_attendee_classifier.is_good_event_by_attendees( + fbl, + fb_event, + fb_event_attending_maybe=fb_event_attending_maybe, + classified_event=classified_event, + ) + logger.debug(f'Attendee classification result: {good_event}') + method = eventdata.CM_AUTO_ATTENDEE + verticals = [street.Style.get_name()] + + if good_event: + result = f"+{event_id}\t{fb_event['info'].get('name', '')}\n" + + if self.dry_run: + logger.info(f"[DRY RUN] Would add event {event_id}") + self.metrics.increment('events_would_add') + results.append(result) + continue + + try: + invite_ids = pe.get_invite_uids() if pe else [] + logger.info(f'Adding event {event_id}, invite_ids: {invite_ids}') + + e = add_entities.add_update_fb_event( + fb_event, + fbl, + visible_to_fb_uids=invite_ids, + creating_method=method, + allow_posting=self.allow_posting, + verticals=verticals, + ) + + # Mark as processed + pe2 = potential_events.PotentialEvent.get_by_key_name(event_id) + pe2.looked_at = True + pe2.auto_looked_at = True + pe2.put() + + results.append(result) + self.metrics.increment('auto-added-dance-events') + + # Track by time period + if e.start_time < datetime.datetime.now(): + self.metrics.increment('auto-added-dance-events-past') + else: + self.metrics.increment('auto-added-dance-events-future') + + # Track by vertical + for vertical in e.verticals: + self.metrics.increment(f'auto-added-vertical-{vertical}') + + 
except fb_api.NoFetchedDataException as e: + logger.error(f"Error adding event {event_id}, no fetched data: {e}") + self.metrics.increment('events_failed_no_data') + except add_entities.AddEventException as e: + logger.warning(f"Error adding event {event_id}: {e}") + self.metrics.increment('events_failed_add') + + return results + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Results written to {uri}") + + +def main( + dancey_only: bool = False, + past_event: bool = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the auto_add_events job. + + Args: + dancey_only: If True, only process events with should_look_at=True + past_event: Filter by past_event status (True/False/None for all) + dry_run: If True, don't actually add events + """ + logger.info(f"Starting auto_add_events job: dancey_only={dancey_only}, past_event={past_event}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = AutoAddEventsJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if dancey_only: + filters.append(('should_look_at', '=', True)) + if past_event is not None: + filters.append(('past_event', '=', past_event)) + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=10, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/base.py b/server/dancedeets/jobs/base.py new file mode 100644 index 00000000..8834f9f4 --- /dev/null +++ b/server/dancedeets/jobs/base.py @@ -0,0 +1,379 @@ +""" +Base classes for Cloud Run Jobs. + +This module provides the foundation for running batch jobs that replace +the legacy App Engine MapReduce functionality. 
+ +Cloud Run Jobs are containerized batch tasks that: +- Run to completion (not request-response like services) +- Support parallel execution via CLOUD_RUN_TASK_INDEX +- Can run up to 24 hours +- Support automatic retries + +Usage: + class MyJob(Job): + def run(self, entity): + # Process a single entity + pass + + if __name__ == '__main__': + runner = JobRunner(MyJob()) + runner.run_from_datastore('dancedeets.events.eventdata.DBEvent') +""" + +import abc +import logging +import os +import sys +from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Type + +from google.cloud import datastore +from google.cloud import storage + +from .metrics import JobMetrics + +# Configure logging for Cloud Run Jobs +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + + +class Job(abc.ABC): + """Base class for all Cloud Run Jobs.""" + + def __init__(self): + self.metrics = JobMetrics() + self._gcs_client: Optional[storage.Client] = None + self._datastore_client: Optional[datastore.Client] = None + + @property + def gcs_client(self) -> storage.Client: + """Lazy-loaded GCS client.""" + if self._gcs_client is None: + self._gcs_client = storage.Client() + return self._gcs_client + + @property + def datastore_client(self) -> datastore.Client: + """Lazy-loaded Datastore client.""" + if self._datastore_client is None: + self._datastore_client = datastore.Client() + return self._datastore_client + + @abc.abstractmethod + def run(self, entity: Any) -> Optional[Any]: + """ + Process a single entity. + + Args: + entity: The entity to process (from Datastore query) + + Returns: + Optional output to be collected (for jobs with output) + """ + pass + + def setup(self) -> None: + """Called once before processing entities. Override for initialization.""" + pass + + def teardown(self) -> None: + """Called once after all entities processed. Override for cleanup.""" + pass + + def on_batch_complete(self, batch: List[Any]) -> None: + """Called after processing a batch of entities. Override for batch operations.""" + pass + + +class BatchJob(Job): + """Job that processes entities in batches instead of one at a time.""" + + def __init__(self, batch_size: int = 20): + super().__init__() + self.batch_size = batch_size + + def run(self, entity: Any) -> Optional[Any]: + """Not used for batch jobs - override run_batch instead.""" + raise NotImplementedError("BatchJob should override run_batch, not run") + + @abc.abstractmethod + def run_batch(self, entities: List[Any]) -> Optional[List[Any]]: + """ + Process a batch of entities. + + Args: + entities: List of entities to process + + Returns: + Optional list of outputs to be collected + """ + pass + + +class JobRunner: + """ + Runs a Job against a set of entities. 
+ + Supports: + - Datastore entity iteration with cursor-based pagination + - Parallel task execution via CLOUD_RUN_TASK_INDEX + - Output collection to GCS + - Progress logging + """ + + def __init__(self, job: Job, project_id: Optional[str] = None): + self.job = job + self.project_id = project_id or os.environ.get('GOOGLE_CLOUD_PROJECT', 'dancedeets-hrd') + self._datastore_client: Optional[datastore.Client] = None + + # Cloud Run Job environment variables + self.task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0')) + self.task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1')) + self.attempt_index = int(os.environ.get('CLOUD_RUN_TASK_ATTEMPT', '0')) + + logger.info( + f"JobRunner initialized: task {self.task_index + 1}/{self.task_count}, " + f"attempt {self.attempt_index + 1}" + ) + + @property + def datastore_client(self) -> datastore.Client: + """Lazy-loaded Datastore client.""" + if self._datastore_client is None: + self._datastore_client = datastore.Client(project=self.project_id) + return self._datastore_client + + def run_from_datastore( + self, + entity_kind: str, + filters: Optional[List[tuple]] = None, + batch_size: int = 100, + limit: Optional[int] = None, + ) -> None: + """ + Run the job against entities from Datastore. + + Args: + entity_kind: Full entity kind path (e.g., 'dancedeets.events.eventdata.DBEvent') + filters: Optional list of (property, operator, value) tuples + batch_size: Number of entities to fetch per query + limit: Optional maximum number of entities to process + """ + filters = filters or [] + + # Extract just the kind name (last part of the dotted path) + kind_name = entity_kind.split('.')[-1] + + logger.info(f"Starting job for entity kind: {kind_name}") + logger.info(f"Filters: {filters}") + + self.job.setup() + + try: + processed_count = 0 + output_buffer: List[Any] = [] + + for entity in self._iterate_entities(kind_name, filters, batch_size, limit): + try: + if isinstance(self.job, BatchJob): + # Batch jobs handle their own batching in _iterate_entities_batched + raise NotImplementedError("Use run_from_datastore_batched for BatchJob") + + result = self.job.run(entity) + if result is not None: + if isinstance(result, (list, tuple)): + output_buffer.extend(result) + else: + output_buffer.append(result) + + processed_count += 1 + self.job.metrics.increment('entities_processed') + + if processed_count % 100 == 0: + logger.info(f"Processed {processed_count} entities") + + except Exception as e: + logger.error(f"Error processing entity {entity.key}: {e}") + self.job.metrics.increment('entities_failed') + # Continue processing other entities + + logger.info(f"Job complete. Processed {processed_count} entities.") + logger.info(f"Metrics: {self.job.metrics.get_all()}") + + finally: + self.job.teardown() + + def run_from_datastore_batched( + self, + entity_kind: str, + filters: Optional[List[tuple]] = None, + batch_size: int = 20, + limit: Optional[int] = None, + ) -> None: + """ + Run a BatchJob against entities from Datastore. 
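+
+        Illustrative sketch (MyBatchJob is a placeholder, not a class in this
+        codebase)::
+
+            runner = JobRunner(MyBatchJob())
+            runner.run_from_datastore_batched(
+                'dancedeets.event_scraper.potential_events.PotentialEvent',
+                filters=[('looked_at', '=', None)],
+                batch_size=20,
+            )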
+ + Args: + entity_kind: Full entity kind path + filters: Optional list of (property, operator, value) tuples + batch_size: Number of entities per batch (overrides job.batch_size) + limit: Optional maximum number of entities to process + """ + if not isinstance(self.job, BatchJob): + raise TypeError("run_from_datastore_batched requires a BatchJob") + + filters = filters or [] + kind_name = entity_kind.split('.')[-1] + + logger.info(f"Starting batch job for entity kind: {kind_name}") + logger.info(f"Batch size: {batch_size}, Filters: {filters}") + + self.job.setup() + + try: + processed_count = 0 + batch: List[Any] = [] + + for entity in self._iterate_entities(kind_name, filters, batch_size, limit): + batch.append(entity) + + if len(batch) >= batch_size: + self._process_batch(batch) + processed_count += len(batch) + batch = [] + + if processed_count % 100 == 0: + logger.info(f"Processed {processed_count} entities") + + # Process remaining entities + if batch: + self._process_batch(batch) + processed_count += len(batch) + + logger.info(f"Batch job complete. Processed {processed_count} entities.") + logger.info(f"Metrics: {self.job.metrics.get_all()}") + + finally: + self.job.teardown() + + def _process_batch(self, batch: List[Any]) -> None: + """Process a batch of entities.""" + try: + self.job.run_batch(batch) + self.job.metrics.increment('batches_processed') + self.job.metrics.increment('entities_processed', len(batch)) + except Exception as e: + logger.error(f"Error processing batch: {e}") + self.job.metrics.increment('batches_failed') + self.job.metrics.increment('entities_failed', len(batch)) + + def _iterate_entities( + self, + kind: str, + filters: List[tuple], + batch_size: int, + limit: Optional[int], + ) -> Generator[Any, None, None]: + """ + Iterate over Datastore entities with cursor-based pagination. + + For parallel Cloud Run Jobs, entities are distributed across tasks + using modulo on the entity key. + """ + query = self.datastore_client.query(kind=kind) + + for prop, op, value in filters: + query.add_filter(prop, op, value) + + cursor = None + total_fetched = 0 + + while True: + # Fetch a page of results + query_iter = query.fetch(start_cursor=cursor, limit=batch_size) + page = list(query_iter) + + if not page: + break + + for entity in page: + # For parallel execution, only process entities assigned to this task + if self.task_count > 1: + # Use hash of key for distribution + entity_hash = hash(str(entity.key)) + if entity_hash % self.task_count != self.task_index: + continue + + yield entity + total_fetched += 1 + + if limit and total_fetched >= limit: + return + + # Get cursor for next page + cursor = query_iter.next_page_token + if cursor is None: + break + + def write_output_to_gcs( + self, + output_lines: Iterable[str], + bucket_name: str, + blob_name: str, + content_type: str = 'text/plain', + ) -> str: + """ + Write output lines to Google Cloud Storage. 
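+        For parallel jobs (task_count > 1) the task index is appended to the
+        blob name, e.g. 'results.txt' becomes 'results-00002.txt' for task 2.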
+ + Args: + output_lines: Iterable of strings to write + bucket_name: GCS bucket name + blob_name: Path within the bucket + content_type: MIME type of the output + + Returns: + GCS URI of the written file + """ + gcs_client = storage.Client() + bucket = gcs_client.bucket(bucket_name) + + # Include task index in filename for parallel jobs + if self.task_count > 1: + name, ext = os.path.splitext(blob_name) + blob_name = f"{name}-{self.task_index:05d}{ext}" + + blob = bucket.blob(blob_name) + + # Write as a single string + content = '\n'.join(output_lines) + blob.upload_from_string(content, content_type=content_type) + + uri = f"gs://{bucket_name}/{blob_name}" + logger.info(f"Wrote output to {uri}") + return uri + + +def run_job( + job_class: Type[Job], + entity_kind: str, + filters: Optional[List[tuple]] = None, + **kwargs, +) -> None: + """ + Convenience function to run a job. + + This is the main entry point for Cloud Run Job containers. + + Args: + job_class: The Job class to instantiate and run + entity_kind: Datastore entity kind to process + filters: Optional query filters + **kwargs: Additional arguments passed to run_from_datastore + """ + job = job_class() + runner = JobRunner(job) + runner.run_from_datastore(entity_kind, filters=filters, **kwargs) diff --git a/server/dancedeets/jobs/classify_events_ml.py b/server/dancedeets/jobs/classify_events_ml.py new file mode 100644 index 00000000..d3b27a44 --- /dev/null +++ b/server/dancedeets/jobs/classify_events_ml.py @@ -0,0 +1,188 @@ +""" +Cloud Run Job: Classify potential events using ML prediction. + +Migrated from: dancedeets/ml/mr_prediction.py + +This job uses a trained ML model to classify potential events and +score them as likely dance events. + +Usage: + python -m dancedeets.jobs.runner --job=classify_events_ml +""" + +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.event_scraper import potential_events +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.ml import gprediction + +logger = logging.getLogger(__name__) + + +class ClassifyEventsMLJob(BatchJob): + """ + Job that classifies potential events using ML prediction. + + Uses Google Prediction API to score events as likely dance events. 
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + self._predict_service = None + logger.info("ClassifyEventsMLJob initialized") + + @property + def predict_service(self): + """Lazy-loaded prediction service.""" + if self._predict_service is None: + self._predict_service = gprediction.get_predict_service() + return self._predict_service + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='ml/classification_results.txt', + content_type='text/plain', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + # Filter to events with match_score > 0 + pe_list = [x for x in pe_list if x.match_score > 0] + if not pe_list: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Get events that don't already have scores + pe_ids = [x.fb_event_id for x in pe_list if not getattr(x, 'dance_bias_score', None)] + if pe_ids: + fbl.request_multi(fb_api.LookupEvent, pe_ids) + fbl.request_multi(fb_api.LookupEventAttending, pe_ids) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + results = [] + for pe in pe_list: + if not getattr(pe, 'dance_bias_score', None): + try: + fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id) + fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id) + except fb_api.NoFetchedDataException: + self.metrics.increment('events_skipped_no_data') + continue + + if fb_event.get('empty'): + self.metrics.increment('events_skipped_empty') + continue + + # Score the event + if self.dry_run: + logger.info(f"[DRY RUN] Would classify event {pe.fb_event_id}") + self.metrics.increment('events_would_classify') + else: + pe = potential_events.update_scores_for_potential_event( + pe, fb_event, fb_event_attending, self.predict_service + ) + self.metrics.increment('events_classified') + + logger.debug( + f"{pe.fb_event_id}: ms={pe.match_score}, " + f"d={pe.dance_bias_score}, nd={pe.non_dance_bias_score}" + ) + + # Report events with high scores in both models + if (getattr(pe, 'dance_bias_score', 0) or 0) > 0.5 and \ + (getattr(pe, 'non_dance_bias_score', 0) or 0) > 0.5: + result = f"{pe.fb_event_id}:{pe.match_score}:{pe.dance_bias_score}:{pe.non_dance_bias_score}\n" + results.append(result) + self.metrics.increment('events_high_score') + + # Write results + if results and not self.dry_run: + for result in results: + self.output_writer.write(result) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Classification results written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the classify_events_ml job. 
+ + Args: + dry_run: If True, don't actually classify or save + """ + logger.info("Starting classify_events_ml job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = ClassifyEventsMLJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Only process events that haven't been looked at + filters = [ + ('looked_at', '=', None), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/compute_rankings.py b/server/dancedeets/jobs/compute_rankings.py new file mode 100644 index 00000000..54655501 --- /dev/null +++ b/server/dancedeets/jobs/compute_rankings.py @@ -0,0 +1,219 @@ +""" +Cloud Run Job: Compute city/country rankings by events and users. + +Migrated from: dancedeets/rankings/rankings.py + +This job counts events and users by city/country for ranking calculations. +Results are stored in memcache for display on the website. + +Usage: + python -m dancedeets.jobs.runner --job=compute_rankings --ranking_type=events --vertical=STREET + python -m dancedeets.jobs.runner --job=compute_rankings --ranking_type=users +""" + +import datetime +import logging + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import GroupedMetrics, JobMetrics, set_current_metrics +from dancedeets.rankings import rankings +from dancedeets.util import memcache + +logger = logging.getLogger(__name__) + +# Time period definitions (from rankings.py) +LAST_WEEK = "LAST_WEEK" +LAST_MONTH = "LAST_MONTH" +ALL_TIME = "ALL_TIME" + + +def get_time_periods(timestamp): + """Get applicable time periods for a given timestamp.""" + now = datetime.datetime.now() + if timestamp > now - datetime.timedelta(days=7): + yield LAST_WEEK + if timestamp > now - datetime.timedelta(days=31): + yield LAST_MONTH + yield ALL_TIME + + +class ComputeEventRankingsJob(Job): + """ + Job that counts events by city for rankings. + + Iterates over all events (optionally filtered by vertical) and + counts them by city and country for different time periods. 
+ """ + + def __init__(self, vertical: str = None, dry_run: bool = False): + super().__init__() + self.vertical = vertical + self.dry_run = dry_run + self.city_counts = GroupedMetrics() + self.country_counts = GroupedMetrics() + logger.info(f"ComputeEventRankingsJob initialized for vertical={vertical}") + + def run(self, dbevent) -> None: + """Process a single event.""" + if not dbevent.start_time: # deleted event, don't count + self.metrics.increment('events_skipped_deleted') + return + + if not dbevent.latitude or not dbevent.longitude: + self.metrics.increment('events_skipped_no_location') + return + + city = dbevent.city_name + country = dbevent.country + + # Determine which time periods this event counts for + timestamp = dbevent.creation_time or dbevent.start_time + for time_period in get_time_periods(timestamp): + if city: + self.city_counts.increment(city, time_period) + if country: + self.country_counts.increment(country, time_period) + + self.metrics.increment('events_processed') + + def teardown(self) -> None: + """Save rankings to memcache after processing.""" + if self.dry_run: + logger.info("[DRY RUN] Would save rankings to memcache") + logger.info(f"City counts: {len(self.city_counts.get_all_groups())} cities") + logger.info(f"Country counts: {len(self.country_counts.get_all_groups())} countries") + return + + # Store city rankings + city_rankings = {} + for city, periods in self.city_counts.get_all_groups().items(): + city_rankings[city] = periods + + # Store country rankings + country_rankings = {} + for country, periods in self.country_counts.get_all_groups().items(): + country_rankings[country] = periods + + # Save to memcache (similar to _compute_summary) + vertical_key = f":{self.vertical}" if self.vertical else "" + memcache.set( + f"CityEventRankings{vertical_key}", + city_rankings, + rankings.TOTALS_EXPIRY, + ) + memcache.set( + f"CountryEventRankings{vertical_key}", + country_rankings, + rankings.TOTALS_EXPIRY, + ) + + logger.info(f"Saved rankings for {len(city_rankings)} cities, {len(country_rankings)} countries") + + # Update the totals summary + total_events = sum( + periods.get(ALL_TIME, 0) + for periods in city_rankings.values() + ) + logger.info(f"Total events (all time): {total_events}") + + +class ComputeUserRankingsJob(Job): + """ + Job that counts users by city for rankings. + + Iterates over all users and counts them by city for different time periods. 
+ """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + self.city_counts = GroupedMetrics() + logger.info("ComputeUserRankingsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + user_city = user.city_name + if not user_city: + self.metrics.increment('users_skipped_no_city') + return + + timestamp = user.creation_time + if not timestamp: + # Use ALL_TIME if no creation time + self.city_counts.increment(user_city, ALL_TIME) + else: + for time_period in get_time_periods(timestamp): + self.city_counts.increment(user_city, time_period) + + self.metrics.increment('users_processed') + + def teardown(self) -> None: + """Save rankings to memcache after processing.""" + if self.dry_run: + logger.info("[DRY RUN] Would save user rankings to memcache") + logger.info(f"City counts: {len(self.city_counts.get_all_groups())} cities") + return + + # Store city rankings + city_rankings = {} + for city, periods in self.city_counts.get_all_groups().items(): + city_rankings[city] = periods + + memcache.set( + "CityUserRankings", + city_rankings, + rankings.TOTALS_EXPIRY, + ) + + logger.info(f"Saved user rankings for {len(city_rankings)} cities") + + # Update the totals summary + total_users = sum( + periods.get(ALL_TIME, 0) + for periods in city_rankings.values() + ) + logger.info(f"Total users (all time): {total_users}") + + +def main( + ranking_type: str = 'events', + vertical: str = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the compute_rankings job. + + Args: + ranking_type: 'events' or 'users' + vertical: Optional vertical filter (e.g., 'STREET') for events + dry_run: If True, don't save to memcache + """ + logger.info(f"Starting compute_rankings job: type={ranking_type}, vertical={vertical}") + + if ranking_type == 'events': + job = ComputeEventRankingsJob(vertical=vertical, dry_run=dry_run) + entity_kind = 'dancedeets.events.eventdata.DBEvent' + filters = [] + if vertical: + filters.append(('verticals', '=', vertical)) + elif ranking_type == 'users': + job = ComputeUserRankingsJob(dry_run=dry_run) + entity_kind = 'dancedeets.users.users.User' + filters = [] + else: + raise ValueError(f"Unknown ranking_type: {ranking_type}") + + set_current_metrics(job.metrics) + runner = JobRunner(job) + + runner.run_from_datastore( + entity_kind=entity_kind, + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/compute_user_stats.py b/server/dancedeets/jobs/compute_user_stats.py new file mode 100644 index 00000000..b7ba4762 --- /dev/null +++ b/server/dancedeets/jobs/compute_user_stats.py @@ -0,0 +1,151 @@ +""" +Cloud Run Job: Compute user event statistics. + +Migrated from: dancedeets/users/user_event_tasks.py + +This job calculates statistics for each user about how many events +they've contributed (auto-added, hand-added, etc.). + +Usage: + python -m dancedeets.jobs.runner --job=compute_user_stats +""" + +import logging + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.events import eventdata +from dancedeets.event_scraper import potential_events + +logger = logging.getLogger(__name__) + + +def update_user_qualities(user) -> dict: + """ + Calculate and update user event contribution statistics. 
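+
+    The dict mirrors the fields written onto the user, for example
+    (values illustrative):
+
+        {'auto_added': 3, 'auto_added_own': 1,
+         'hand_added': 2, 'hand_added_own': 2}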
+ + Returns: + Dict with the calculated statistics + """ + # Query potential events where this user is a source + # STR_ID_MIGRATE: using long() for fb_uid + try: + fb_uid_long = int(user.fb_uid) + except (ValueError, TypeError): + fb_uid_long = user.fb_uid + + source_potential_events = potential_events.PotentialEvent.gql( + 'WHERE source_ids = :graph_id', + graph_id=fb_uid_long + ).fetch(1000) + + # Get the actual events that were added + added_events = eventdata.DBEvent.get_by_ids( + [x.fb_event_id for x in source_potential_events] + ) + + # Count auto-added events + num_auto_added = len([ + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + ]) + + # Count auto-added events owned by this user + num_auto_added_own = len([ + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + and x.owner_fb_uid == user.fb_uid + ]) + + # Count hand-added events (created by this user) + # STR_ID_MIGRATE + num_hand_added = len([ + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + ]) + + # Count hand-added events owned by this user + # STR_ID_MIGRATE + num_hand_added_own = len([ + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + and x.owner_fb_uid == user.fb_uid + ]) + + # Update user properties + user.num_auto_added_events = num_auto_added + user.num_auto_added_own_events = num_auto_added_own + user.num_hand_added_events = num_hand_added + user.num_hand_added_own_events = num_hand_added_own + + return { + 'auto_added': num_auto_added, + 'auto_added_own': num_auto_added_own, + 'hand_added': num_hand_added, + 'hand_added_own': num_hand_added_own, + } + + +class ComputeUserStatsJob(Job): + """ + Job that computes event statistics for each user. + + For each user, counts: + - Auto-added events (via ML classifier) + - Hand-added events (manually added by user) + - Events owned by the user + """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + logger.info("ComputeUserStatsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + try: + stats = update_user_qualities(user) + + if self.dry_run: + logger.info( + f"[DRY RUN] User {user.fb_uid}: " + f"auto={stats['auto_added']}, hand={stats['hand_added']}" + ) + self.metrics.increment('users_would_update') + else: + user.put() + self.metrics.increment('users_updated') + + # Track totals + self.metrics.increment('total_auto_added', stats['auto_added']) + self.metrics.increment('total_hand_added', stats['hand_added']) + + except Exception as e: + logger.error(f"Error processing user {user.fb_uid}: {e}") + self.metrics.increment('users_failed') + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the compute_user_stats job. 
+ + Args: + dry_run: If True, don't save changes to users + """ + logger.info("Starting compute_user_stats job") + + job = ComputeUserStatsJob(dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/count_unique_attendees.py b/server/dancedeets/jobs/count_unique_attendees.py new file mode 100644 index 00000000..ca7a5089 --- /dev/null +++ b/server/dancedeets/jobs/count_unique_attendees.py @@ -0,0 +1,195 @@ +""" +Cloud Run Job: Count unique attendees per city. + +Migrated from: dancedeets/logic/unique_attendees.py + +This job counts unique RSVPs across all events, grouped by city and country. + +Usage: + python -m dancedeets.jobs.runner --job=count_unique_attendees +""" + +import logging +from collections import defaultdict +from typing import Dict, Optional, Set + +from dancedeets import fb_api +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +BATCH_SIZE = 20 + + +class CountUniqueAttendeesJob(BatchJob): + """ + Job that counts unique attendees per city/country. + + This is a "reduce" style job that aggregates attendees across all events. + Uses in-memory aggregation instead of MapReduce framework. + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=BATCH_SIZE) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + + # In-memory aggregation (replaces MapReduce reduce step) + # Maps location -> set of attendee IDs + self.city_attendees: Dict[str, Set[str]] = defaultdict(set) + self.country_attendees: Dict[str, Set[str]] = defaultdict(set) + + # For counting total RSVPs (not unique) + self.city_rsvp_count: Dict[str, int] = defaultdict(int) + self.country_rsvp_count: Dict[str, int] = defaultdict(int) + + logger.info("CountUniqueAttendeesJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='analytics/unique_attendees.txt', + content_type='text/plain', + ) + + def run_batch(self, db_events: list) -> None: + """Process a batch of events.""" + # Filter to Facebook events only + db_events = [x for x in db_events if x.is_fb_event] + if not db_events: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Request attending data + fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events]) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + for db_event in db_events: + try: + fb_event_attending = fbl.fetched_data( + fb_api.LookupEventAttending, db_event.id + ) + except fb_api.NoFetchedDataException: + logger.warning(f'No attending found for {db_event.id}') + self.metrics.increment('events_no_attending') + 
continue + + if fb_event_attending.get('empty'): + self.metrics.increment('events_empty_attending') + continue + + attendees = fb_event_attending.get('attending', {}).get('data', []) + city = db_event.city_name + country = db_event.country + + for attendee in attendees: + attendee_id = attendee['id'] + + if city: + self.city_attendees[city].add(attendee_id) + self.city_rsvp_count[city] += 1 + + if country: + self.country_attendees[country].add(attendee_id) + self.country_rsvp_count[country] += 1 + + self.metrics.increment('events_processed') + self.metrics.increment('attendees_processed', len(attendees)) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Write final results.""" + if self.dry_run: + logger.info("[DRY RUN] Would write attendee counts") + logger.info(f" Cities: {len(self.city_attendees)}") + logger.info(f" Countries: {len(self.country_attendees)}") + return + + # Write city results + for city in sorted(self.city_attendees.keys()): + unique_count = len(self.city_attendees[city]) + total_count = self.city_rsvp_count[city] + self.output_writer.write(f"Unique Attendees in City: {city}: {unique_count}") + self.output_writer.write(f"Total RSVPs in City: {city}: {total_count}") + + # Write country results + for country in sorted(self.country_attendees.keys()): + unique_count = len(self.country_attendees[country]) + total_count = self.country_rsvp_count[country] + self.output_writer.write(f"Unique Attendees in Country: {country}: {unique_count}") + self.output_writer.write(f"Total RSVPs in Country: {country}: {total_count}") + + uri = self.output_writer.flush() + logger.info(f"Results written to {uri}") + + # Log summary + total_unique = sum(len(s) for s in self.city_attendees.values()) + logger.info(f"Total unique attendees across all cities: {total_unique}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the count_unique_attendees job. + + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting count_unique_attendees job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = CountUniqueAttendeesJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + batch_size=BATCH_SIZE, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/dump_potential_events.py b/server/dancedeets/jobs/dump_potential_events.py new file mode 100644 index 00000000..7bf2ab00 --- /dev/null +++ b/server/dancedeets/jobs/dump_potential_events.py @@ -0,0 +1,160 @@ +""" +Cloud Run Job: Dump potential events data to GCS. + +Migrated from: dancedeets/logic/mr_dump.py + +This job exports potential event data (from Facebook API) to CSV format +in Google Cloud Storage for analysis or ML training. 
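+
+Each exported row is a (Facebook cache key, raw JSON) pair written as CSV to
+exports/potential_events.csv; only potential events with match_score > 0 that
+have not yet been looked at are included.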
+ +Usage: + python -m dancedeets.jobs.runner --job=dump_potential_events +""" + +import csv +import io +import json +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +class DumpPotentialEventsJob(BatchJob): + """ + Job that dumps potential event data to GCS. + + Fetches event data from Facebook and writes as CSV. + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=80) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + logger.info("DumpPotentialEventsJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='exports/potential_events.csv', + content_type='text/csv', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + # Filter to events with match_score > 0 + pe_list = [x for x in pe_list if x.match_score > 0] + if not pe_list: + self.metrics.increment('batches_empty') + return + + # Get Facebook lookup + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Request event data from Facebook + fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list]) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + # Build CSV output + csv_file = io.StringIO() + csv_writer = csv.writer(csv_file) + + for pe in pe_list: + try: + result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)) + cache_key = fbl.key_to_cache_key( + fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id) + ) + csv_writer.writerow([cache_key, result]) + self.metrics.increment('events_exported') + except fb_api.NoFetchedDataException: + logger.error(f"Skipping row for event id {pe.fb_event_id}") + self.metrics.increment('events_skipped_no_data') + + # Write to GCS + if self.dry_run: + logger.info(f"[DRY RUN] Would write {len(pe_list)} events to GCS") + else: + self.output_writer.write(csv_file.getvalue()) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Export written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the dump_potential_events job. 
+ + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting dump_potential_events job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=False, # Don't pollute cache with this data + ) if tokens else None + + job = DumpPotentialEventsJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Only process events that haven't been looked at + filters = [ + ('looked_at', '=', None), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=80, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/email_crawl_errors.py b/server/dancedeets/jobs/email_crawl_errors.py new file mode 100644 index 00000000..a70b70fa --- /dev/null +++ b/server/dancedeets/jobs/email_crawl_errors.py @@ -0,0 +1,175 @@ +""" +Cloud Run Job: Email crawl errors from spider jobs. + +Migrated from: dancedeets/classes/class_pipeline.py (EmailErrors) + +This job collects errors from completed spider jobs and sends +an email report. + +Usage: + python -m dancedeets.jobs.runner --job=email_crawl_errors --job_keys=key1,key2 +""" + +import datetime +import json +import logging +import os +from typing import Dict, List + +import scrapinghub + +from dancedeets import keys +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.mail import mandrill_api + +logger = logging.getLogger(__name__) + + +def get_shub_project(): + """Get ScrapingHub project connection.""" + api_key = keys.get('scrapinghub_key') + conn = scrapinghub.Connection(api_key) + project = scrapinghub.Project(conn, 27474) + return project + + +def collect_errors(job_keys: List[str]) -> Dict[str, List[str]]: + """ + Collect errors from completed spider jobs. + + Args: + job_keys: List of ScrapingHub job keys + + Returns: + Dict mapping spider name to list of error messages + """ + project = get_shub_project() + error_lines: Dict[str, List[str]] = {} + + for job_key in job_keys: + try: + job = project.job(job_key) + spider_name = job.info.get('spider', job_key) + + # Check for no items scraped + if not job.info.get('items_scraped'): + error_lines.setdefault(spider_name, []).append( + 'Could not find any items.' + ) + + # Collect error-level log entries + for line in job.log(): + if line.get('level', 0) >= 40: # ERROR level + error_lines.setdefault(spider_name, []).append( + line.get('message', str(line)) + ) + + except Exception as e: + logger.error(f"Error collecting logs for {job_key}: {e}") + error_lines.setdefault(job_key, []).append(f"Error collecting logs: {e}") + + return error_lines + + +def send_error_email( + error_lines: Dict[str, List[str]], + run_time: datetime.datetime, + dry_run: bool = False, +) -> None: + """ + Send email report of crawl errors. 
+ + Args: + error_lines: Dict mapping spider name to error messages + run_time: When the crawl started + dry_run: If True, don't actually send email + """ + if not error_lines: + logger.info("No errors to report") + return + + # Build email body + rendered = ["The following crawl errors occurred:"] + for crawler, errors in sorted(error_lines.items()): + rendered.append(f"\n{crawler}:") + rendered.extend(f" - {error}" for error in errors) + + body = '\n'.join(rendered) + logger.warning(body) + + if dry_run: + logger.info("[DRY RUN] Would send error email") + return + + subject = f"Crawl Errors for {run_time.strftime('%b %d, %Y: %H:%M')}" + message = { + 'from_email': 'reports@dancedeets.com', + 'from_name': 'DanceDeets Reports', + 'subject': subject, + 'to': [{ + 'email': 'reports@dancedeets.com', + 'name': 'DanceDeets Reports', + 'type': 'to', + }], + 'text': body, + } + + try: + mandrill_api.send_message(message) + logger.info("Error report email sent") + except Exception as e: + logger.error(f"Error sending email: {e}") + + +def main( + job_keys: str = None, + run_time: str = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the email_crawl_errors job. + + Args: + job_keys: Comma-separated list of ScrapingHub job keys + run_time: ISO format timestamp of when crawl started + dry_run: If True, don't send email + """ + logger.info("Starting email_crawl_errors job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + # Parse job keys + if not job_keys: + logger.warning("No job_keys provided") + return + + if isinstance(job_keys, str): + keys_list = [k.strip() for k in job_keys.split(',') if k.strip()] + else: + keys_list = job_keys + + logger.info(f"Checking {len(keys_list)} jobs for errors") + + # Parse run time + if run_time: + try: + crawl_time = datetime.datetime.fromisoformat(run_time) + except ValueError: + crawl_time = datetime.datetime.now() + else: + crawl_time = datetime.datetime.now() + + # Collect and send errors + error_lines = collect_errors(keys_list) + metrics.increment('spiders_checked', len(keys_list)) + metrics.increment('spiders_with_errors', len(error_lines)) + + send_error_email(error_lines, crawl_time, dry_run=dry_run) + + metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/fb_utils.py b/server/dancedeets/jobs/fb_utils.py new file mode 100644 index 00000000..1aefa8ef --- /dev/null +++ b/server/dancedeets/jobs/fb_utils.py @@ -0,0 +1,185 @@ +""" +Facebook API utilities for Cloud Run Jobs. + +Ported from dancedeets.util.fb_mapreduce to work with the new +Cloud Run Jobs framework instead of MapReduce. +""" + +import datetime +import logging +import random +from typing import Any, Dict, List, Optional + +from dancedeets import fb_api +from dancedeets.users import access_tokens + +logger = logging.getLogger(__name__) + + +class FBJobContext: + """ + Context for Facebook API access within a job. + + Replaces the MapReduce context-based token storage with explicit + parameter passing. 
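+
+    Usage (a minimal sketch mirroring how the jobs in this package construct
+    it; fb_event_ids stands for any list of event ids):
+
+        tokens = get_multiple_tokens(token_count=50)
+        ctx = FBJobContext(fb_uid='system', access_tokens=tokens, allow_cache=True)
+        fbl = ctx.get_fblookup()
+        fbl.request_multi(fb_api.LookupEvent, fb_event_ids)
+        fbl.batch_fetch()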
+ """ + + def __init__( + self, + fb_uid: str, + access_token: Optional[str] = None, + access_tokens: Optional[List[str]] = None, + allow_cache: bool = True, + oldest_allowed: Optional[datetime.datetime] = None, + ): + self.fb_uid = fb_uid + self._access_token = access_token + self._access_tokens = access_tokens or [] + self.allow_cache = allow_cache + self.oldest_allowed = oldest_allowed + + @property + def access_token(self) -> str: + """Get an access token, randomly selecting from pool if available.""" + if self._access_tokens: + return random.choice(self._access_tokens) + return self._access_token or '' + + def get_fblookup(self, user: Optional[Any] = None) -> fb_api.FBLookup: + """ + Create an FBLookup instance for API calls. + + Args: + user: Optional user object with fb_uid and fb_access_token + + Returns: + Configured FBLookup instance + """ + if user: + uid = user.fb_uid + token = user.fb_access_token or self.access_token + else: + uid = self.fb_uid + token = self.access_token + + fbl = fb_api.FBLookup(uid, token) + fbl.allow_cache = self.allow_cache + + if self.oldest_allowed is not None: + fbl.db.oldest_allowed = self.oldest_allowed + + return fbl + + +def get_fblookup_params( + fbl: fb_api.FBLookup, + randomize_tokens: bool = False, + token_count: int = 50, +) -> Dict[str, Any]: + """ + Extract parameters from an FBLookup for job configuration. + + This creates a serializable dict that can be passed to job constructors. + + Args: + fbl: Source FBLookup instance + randomize_tokens: If True, fetch multiple tokens for rotation + token_count: Number of tokens to fetch when randomizing + + Returns: + Dict of parameters for FBJobContext + """ + params = { + 'fb_uid': fbl.fb_uid, + 'allow_cache': fbl.allow_cache, + } + + if fbl.db.oldest_allowed != datetime.datetime.min: + params['oldest_allowed'] = fbl.db.oldest_allowed + + if randomize_tokens: + tokens = get_multiple_tokens(token_count=token_count) + logger.info(f'Found {len(tokens)} valid tokens') + if len(tokens) == 0: + raise Exception('No Valid Tokens') + params['access_tokens'] = tokens + else: + params['access_token'] = fbl.access_token + + return params + + +def get_multiple_tokens(token_count: int = 50) -> List[str]: + """ + Get multiple valid access tokens for token rotation. + + For long-running jobs, using multiple tokens helps avoid rate limiting. + + Args: + token_count: Maximum number of tokens to return + + Returns: + List of valid access token strings + """ + return access_tokens.get_multiple_tokens(token_count=token_count) + + +def get_fblookup( + fb_uid: str, + access_token: Optional[str] = None, + access_tokens: Optional[List[str]] = None, + allow_cache: bool = True, + oldest_allowed: Optional[datetime.datetime] = None, + user: Optional[Any] = None, +) -> fb_api.FBLookup: + """ + Create an FBLookup instance for API calls. + + This is a convenience function that mirrors the old MapReduce pattern. 
+ + Args: + fb_uid: Facebook user ID + access_token: Single access token + access_tokens: List of tokens for rotation + allow_cache: Whether to use caching + oldest_allowed: Oldest cache entry to accept + user: Optional user object to get token from + + Returns: + Configured FBLookup instance + """ + ctx = FBJobContext( + fb_uid=fb_uid, + access_token=access_token, + access_tokens=access_tokens, + allow_cache=allow_cache, + oldest_allowed=oldest_allowed, + ) + return ctx.get_fblookup(user=user) + + +def create_fb_context_from_fbl( + fbl: fb_api.FBLookup, + randomize_tokens: bool = False, +) -> FBJobContext: + """ + Create an FBJobContext from an existing FBLookup. + + Useful when starting a job from a web request handler that already + has an authenticated FBLookup. + + Args: + fbl: Source FBLookup instance + randomize_tokens: If True, fetch multiple tokens for rotation + + Returns: + FBJobContext configured from the FBLookup + """ + params = get_fblookup_params(fbl, randomize_tokens=randomize_tokens) + return FBJobContext( + fb_uid=params['fb_uid'], + access_token=params.get('access_token'), + access_tokens=params.get('access_tokens'), + allow_cache=params['allow_cache'], + oldest_allowed=params.get('oldest_allowed'), + ) diff --git a/server/dancedeets/jobs/gcs_output.py b/server/dancedeets/jobs/gcs_output.py new file mode 100644 index 00000000..34f4f4e9 --- /dev/null +++ b/server/dancedeets/jobs/gcs_output.py @@ -0,0 +1,167 @@ +""" +Google Cloud Storage output utilities for Cloud Run Jobs. + +Provides helpers for writing job output to GCS, replacing the +MapReduce GoogleCloudStorageOutputWriter. +""" + +import datetime +import logging +import os +from typing import Iterable, Optional + +from google.cloud import storage + +logger = logging.getLogger(__name__) + +DEFAULT_BUCKET = 'dancedeets-hrd.appspot.com' + + +class GCSOutputWriter: + """ + Writes job output to Google Cloud Storage. + + Usage: + with GCSOutputWriter(bucket, 'output/results.txt') as writer: + for line in results: + writer.write(line) + """ + + def __init__( + self, + bucket_name: str = DEFAULT_BUCKET, + blob_name: Optional[str] = None, + content_type: str = 'text/plain', + include_task_index: bool = True, + ): + self.bucket_name = bucket_name + self._blob_name = blob_name + self.content_type = content_type + self.include_task_index = include_task_index + self._buffer: list = [] + self._client: Optional[storage.Client] = None + + @property + def blob_name(self) -> str: + """Get the blob name, optionally including task index.""" + if self._blob_name is None: + timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + job_name = os.environ.get('CLOUD_RUN_JOB', 'job') + self._blob_name = f"jobs/{job_name}/{timestamp}/output.txt" + + if self.include_task_index: + task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1')) + if task_count > 1: + task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0')) + name, ext = os.path.splitext(self._blob_name) + return f"{name}-{task_index:05d}{ext}" + + return self._blob_name + + @property + def client(self) -> storage.Client: + """Lazy-loaded GCS client.""" + if self._client is None: + self._client = storage.Client() + return self._client + + def write(self, line: str) -> None: + """Write a line to the buffer.""" + self._buffer.append(line) + + def write_all(self, lines: Iterable[str]) -> None: + """Write multiple lines to the buffer.""" + self._buffer.extend(lines) + + def flush(self) -> str: + """ + Flush the buffer to GCS. 
+ + Returns: + GCS URI of the written file + """ + if not self._buffer: + logger.warning("No content to write to GCS") + return "" + + bucket = self.client.bucket(self.bucket_name) + blob = bucket.blob(self.blob_name) + + content = '\n'.join(str(line) for line in self._buffer) + blob.upload_from_string(content, content_type=self.content_type) + + uri = f"gs://{self.bucket_name}/{self.blob_name}" + logger.info(f"Wrote {len(self._buffer)} lines to {uri}") + + self._buffer = [] + return uri + + def __enter__(self) -> 'GCSOutputWriter': + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + if self._buffer: + self.flush() + + +def write_to_gcs( + content: str, + bucket_name: str = DEFAULT_BUCKET, + blob_name: Optional[str] = None, + content_type: str = 'text/plain', +) -> str: + """ + Convenience function to write content directly to GCS. + + Args: + content: String content to write + bucket_name: GCS bucket name + blob_name: Path within the bucket + content_type: MIME type + + Returns: + GCS URI of the written file + """ + writer = GCSOutputWriter(bucket_name, blob_name, content_type) + writer.write(content) + return writer.flush() + + +def read_from_gcs( + bucket_name: str, + blob_name: str, +) -> str: + """ + Read content from a GCS file. + + Args: + bucket_name: GCS bucket name + blob_name: Path within the bucket + + Returns: + File contents as string + """ + client = storage.Client() + bucket = client.bucket(bucket_name) + blob = bucket.blob(blob_name) + return blob.download_as_text() + + +def list_gcs_blobs( + bucket_name: str, + prefix: str, +) -> list: + """ + List blobs in a GCS bucket with a given prefix. + + Args: + bucket_name: GCS bucket name + prefix: Blob name prefix to filter by + + Returns: + List of blob names + """ + client = storage.Client() + bucket = client.bucket(bucket_name) + blobs = bucket.list_blobs(prefix=prefix) + return [blob.name for blob in blobs] diff --git a/server/dancedeets/jobs/generate_sitemaps.py b/server/dancedeets/jobs/generate_sitemaps.py new file mode 100644 index 00000000..1af25967 --- /dev/null +++ b/server/dancedeets/jobs/generate_sitemaps.py @@ -0,0 +1,219 @@ +""" +Cloud Run Job: Generate XML sitemaps for search engine indexing. + +Migrated from: dancedeets/sitemaps/events.py + +This job generates XML sitemap files for all events and uploads them +to Google Cloud Storage for search engine crawlers. + +Usage: + python -m dancedeets.jobs.runner --job=generate_sitemaps + python -m dancedeets.jobs.runner --job=generate_sitemaps --vertical=STREET --time_period=FUTURE +""" + +import datetime +import logging +from typing import Optional + +from lxml import etree + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.util import urls + +logger = logging.getLogger(__name__) + + +def generate_sitemap_entry(event) -> Optional[str]: + """ + Generate a single sitemap XML entry for an event. 
+ + Args: + event: DBEvent instance + + Returns: + XML string for the URL entry, or None if event should be skipped + """ + if not event.has_content(): + return None + + url_node = etree.Element('url') + + # Location + loc_node = etree.Element('loc') + loc_node.text = urls.dd_event_url(event) + url_node.append(loc_node) + + # Last modified (from Facebook updated_time) + if event.is_fb_event: + if 'updated_time' in event.fb_event.get('info', {}): + lastmod_node = etree.Element('lastmod') + updated = event.fb_event['info']['updated_time'] + updated = updated.replace('+0000', '+00:00') + lastmod_node.text = updated + url_node.append(lastmod_node) + else: + logger.debug(f'Event {event.id} does not have updated_time') + + # Calculate timing-based metadata + if event.end_time: + end_time = event.end_time + else: + end_time = event.start_time + datetime.timedelta(hours=2) + + start_time_delta = event.start_time - datetime.datetime.now() + end_time_delta = end_time - datetime.datetime.now() + event_delta = end_time - event.start_time + + # Change frequency and priority + changefreq_node = etree.Element('changefreq') + priority_node = etree.Element('priority') + priority_node.text = '0.5' + + # Event is active and not a multi-week event + if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: + changefreq_node.text = 'hourly' + # If it ended awhile ago + elif end_time_delta.days < -30: + changefreq_node.text = 'yearly' + priority_node.text = '0.1' + elif end_time_delta.days < -10: + changefreq_node.text = 'weekly' + # If it's coming up soon + elif start_time_delta.days < 30: + changefreq_node.text = 'daily' + else: + changefreq_node.text = 'weekly' + + url_node.append(changefreq_node) + url_node.append(priority_node) + + # Return as single line + return etree.tostring(url_node, encoding='unicode') + + +class GenerateSitemapsJob(BatchJob): + """ + Job that generates XML sitemaps for events. + + Processes events in batches and writes sitemap entries to GCS. 
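+
+    Output is written to sitemaps/<vertical>/<time_period>/sitemap-<timestamp>.xml
+    in the target bucket; the vertical and time_period path segments are omitted
+    when those filters are not set.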
+ """ + + def __init__( + self, + vertical: Optional[str] = None, + time_period: Optional[str] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.vertical = vertical + self.time_period = time_period + self.bucket_name = bucket_name + self.dry_run = dry_run + + # Build output path + parts = ['sitemaps'] + if vertical: + parts.append(vertical.lower()) + if time_period: + parts.append(time_period.lower()) + timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + parts.append(f'sitemap-{timestamp}.xml') + self.blob_name = '/'.join(parts) + + self.output_writer = None + logger.info(f"GenerateSitemapsJob initialized: vertical={vertical}, time_period={time_period}") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name=self.blob_name, + content_type='text/xml', + ) + # Write XML header + self.output_writer.write('') + self.output_writer.write('') + + def run_batch(self, events: list) -> None: + """Process a batch of events.""" + for event in events: + try: + entry = generate_sitemap_entry(event) + if entry: + if self.dry_run: + logger.debug(f"Would write sitemap entry for event {event.id}") + self.metrics.increment('entries_would_write') + else: + self.output_writer.write(entry) + self.metrics.increment('entries_written') + else: + self.metrics.increment('events_skipped_no_content') + except Exception as e: + logger.error(f"Error generating sitemap for event {event.id}: {e}") + self.metrics.increment('events_failed') + + self.metrics.increment('events_processed', len(events)) + + def teardown(self) -> None: + """Finalize and upload the sitemap.""" + if self.dry_run: + logger.info("[DRY RUN] Would write sitemap to GCS") + return + + if self.output_writer: + # Write closing tag + self.output_writer.write('') + uri = self.output_writer.flush() + logger.info(f"Sitemap written to {uri}") + + +def main( + vertical: Optional[str] = None, + time_period: Optional[str] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the generate_sitemaps job. + + Args: + vertical: Optional vertical filter (e.g., 'STREET') + time_period: Optional time period filter (e.g., 'FUTURE', 'PAST') + bucket_name: GCS bucket for output + dry_run: If True, don't write to GCS + """ + logger.info(f"Starting generate_sitemaps job: vertical={vertical}, time_period={time_period}") + + job = GenerateSitemapsJob( + vertical=vertical, + time_period=time_period, + bucket_name=bucket_name, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if vertical: + filters.append(('verticals', '=', vertical)) + if time_period: + filters.append(('search_time_period', '=', time_period)) + + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/generate_training_data.py b/server/dancedeets/jobs/generate_training_data.py new file mode 100644 index 00000000..2284bcc0 --- /dev/null +++ b/server/dancedeets/jobs/generate_training_data.py @@ -0,0 +1,228 @@ +""" +Cloud Run Job: Generate ML training data from potential events. 
+ +Migrated from: dancedeets/ml/gprediction.py + +This job generates training data for the ML classifier by extracting +features from potential events and writing them to GCS. + +Usage: + python -m dancedeets.jobs.runner --job=generate_training_data +""" + +import csv +import io +import logging +import string +from typing import Optional + +from dancedeets import fb_api +from dancedeets.events import eventdata +from dancedeets.events import event_locations +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +# Character translation for stripping punctuation +convert_chars = string.punctuation + '\r\n\t' +trans = str.maketrans(convert_chars, ' ' * len(convert_chars)) + + +def strip_punctuation(s: str) -> str: + """Remove punctuation from a string.""" + return s.translate(trans) + + +def get_training_features(potential_event, fb_event, fb_event_attending) -> tuple: + """ + Extract training features from an event. + + Returns: + Tuple of feature values + """ + if 'owner' in fb_event['info']: + owner_name = 'id%s' % fb_event['info']['owner']['id'] + else: + owner_name = '' + + location = event_locations.get_address_for_fb_event(fb_event) + + def strip_text(s): + if isinstance(s, bytes): + s = s.decode('utf-8') + return strip_punctuation(s).lower() + + name = strip_text(fb_event['info'].get('name', '')) + description = strip_text(fb_event['info'].get('description', '')) + + attendee_list = ' '.join([ + 'id%s' % x['id'] + for x in fb_event_attending.get('attending', {}).get('data', []) + ]) + + source_list = ' '.join( + 'id%s' % x.id + for x in potential_event.source_ids_only() + ) + + # Currently only returning attendee_list (as per original code) + return (attendee_list,) + # Full features would be: + # return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) + + +class GenerateTrainingDataJob(BatchJob): + """ + Job that generates ML training data from potential events. + + Extracts features from events and writes CSV training data to GCS. 
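+
+    Each CSV row is labeled 'dance' if the potential event already exists as a
+    DBEvent, or 'nodance' otherwise; only potential events that have been
+    looked at are included.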
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + logger.info("GenerateTrainingDataJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='ml/training_data.csv', + content_type='text/csv', + ) + + def run_batch(self, pevents: list) -> None: + """Process a batch of potential events.""" + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_memcache_write = False # Don't pollute memcache + + # Only process events that have been looked at + fb_event_ids = [x.fb_event_id for x in pevents if x.looked_at] + if not fb_event_ids: + self.metrics.increment('batches_empty') + return + + # Fetch from Facebook + fbl.request_multi(fb_api.LookupEvent, fb_event_ids) + fbl.request_multi(fb_api.LookupEventAttending, fb_event_ids) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + # Get existing events to determine labels + good_event_ids = [ + x.fb_event_id + for x in eventdata.DBEvent.get_by_ids(fb_event_ids, keys_only=True) + if x + ] + + # Build CSV + csv_file = io.StringIO() + csv_writer = csv.writer(csv_file) + + for potential_event in pevents: + if not potential_event.looked_at: + continue + + try: + # Label: 'dance' if event exists in DB, 'nodance' otherwise + label = 'dance' if potential_event.fb_event_id in good_event_ids else 'nodance' + + fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id) + if fb_event.get('empty'): + self.metrics.increment('events_skipped_empty') + continue + + fb_event_attending = fbl.fetched_data( + fb_api.LookupEventAttending, + potential_event.fb_event_id + ) + + training_features = get_training_features( + potential_event, fb_event, fb_event_attending + ) + csv_writer.writerow([label] + list(training_features)) + self.metrics.increment('rows_written') + + except fb_api.NoFetchedDataException: + logger.debug(f"No data fetched for event id {potential_event.fb_event_id}") + self.metrics.increment('events_skipped_no_data') + + # Write to GCS + output = csv_file.getvalue() + if output: + if self.dry_run: + logger.info(f"[DRY RUN] Would write training data to GCS") + else: + self.output_writer.write(output) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Training data written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the generate_training_data job. 
+ + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting generate_training_data job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = GenerateTrainingDataJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/metrics.py b/server/dancedeets/jobs/metrics.py new file mode 100644 index 00000000..e3b7be82 --- /dev/null +++ b/server/dancedeets/jobs/metrics.py @@ -0,0 +1,151 @@ +""" +Metrics tracking for Cloud Run Jobs. + +Replaces MapReduce counters with in-memory tracking and optional +Cloud Monitoring integration. +""" + +import logging +import os +from collections import defaultdict +from typing import Dict, Optional + +logger = logging.getLogger(__name__) + + +class JobMetrics: + """ + In-memory counter implementation for job metrics. + + Replaces MapReduce op.counters.Increment with a simple dict-based + counter that can optionally export to Cloud Monitoring. + """ + + def __init__(self, job_name: Optional[str] = None): + self.job_name = job_name or os.environ.get('CLOUD_RUN_JOB', 'unknown') + self._counters: Dict[str, int] = defaultdict(int) + + def increment(self, key: str, delta: int = 1) -> None: + """ + Increment a counter. + + Args: + key: Counter name + delta: Amount to increment (default 1) + """ + self._counters[key] += delta + + def get(self, key: str) -> int: + """ + Get the current value of a counter. + + Args: + key: Counter name + + Returns: + Current counter value (0 if not set) + """ + return self._counters.get(key, 0) + + def get_all(self) -> Dict[str, int]: + """ + Get all counter values. + + Returns: + Dict of counter names to values + """ + return dict(self._counters) + + def log_summary(self) -> None: + """Log a summary of all counters.""" + logger.info(f"Job metrics for {self.job_name}:") + for key, value in sorted(self._counters.items()): + logger.info(f" {key}: {value}") + + def export_to_cloud_monitoring(self) -> None: + """ + Export metrics to Cloud Monitoring. + + This is optional and requires the google-cloud-monitoring package. + """ + try: + from google.cloud import monitoring_v3 + + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{os.environ.get('GOOGLE_CLOUD_PROJECT', 'dancedeets-hrd')}" + + for key, value in self._counters.items(): + # Create a custom metric descriptor if needed + # This is a simplified version - full implementation would + # create proper metric descriptors + logger.info(f"Would export to Cloud Monitoring: {key}={value}") + + except ImportError: + logger.warning("google-cloud-monitoring not installed, skipping export") + except Exception as e: + logger.error(f"Error exporting to Cloud Monitoring: {e}") + + +class GroupedMetrics: + """ + Metrics that can be grouped by a key (e.g., city, time_period). + + Useful for ranking-style aggregations. 
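+
+    Usage (sketch; group and counter keys are arbitrary strings):
+
+        counts = GroupedMetrics()
+        counts.increment('Tokyo, Japan', 'ALL_TIME')
+        counts.increment('Tokyo, Japan', 'LAST_MONTH', delta=2)
+        counts.get_group('Tokyo, Japan')   # {'ALL_TIME': 1, 'LAST_MONTH': 2}
+        counts.get_totals('ALL_TIME')      # {'Tokyo, Japan': 1}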
+ """ + + def __init__(self): + self._groups: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int)) + + def increment(self, group_key: str, counter_key: str, delta: int = 1) -> None: + """ + Increment a counter within a group. + + Args: + group_key: The group identifier (e.g., city name) + counter_key: The counter name within the group + delta: Amount to increment + """ + self._groups[group_key][counter_key] += delta + + def get_group(self, group_key: str) -> Dict[str, int]: + """Get all counters for a group.""" + return dict(self._groups.get(group_key, {})) + + def get_all_groups(self) -> Dict[str, Dict[str, int]]: + """Get all groups and their counters.""" + return {k: dict(v) for k, v in self._groups.items()} + + def get_totals(self, counter_key: str) -> Dict[str, int]: + """Get totals for a specific counter across all groups.""" + return { + group_key: counters.get(counter_key, 0) + for group_key, counters in self._groups.items() + if counters.get(counter_key, 0) > 0 + } + + +# Global metrics instance for compatibility with old mr.increment() pattern +_current_metrics: Optional[JobMetrics] = None + + +def set_current_metrics(metrics: JobMetrics) -> None: + """Set the current job metrics instance (for compatibility).""" + global _current_metrics + _current_metrics = metrics + + +def get_current_metrics() -> Optional[JobMetrics]: + """Get the current job metrics instance.""" + return _current_metrics + + +def increment(key: str, delta: int = 1) -> None: + """ + Increment a counter (compatibility wrapper). + + This provides the same interface as the old mr.increment() function. + """ + if _current_metrics: + _current_metrics.increment(key, delta) + else: + logger.warning(f"No current metrics context, cannot increment {key}") diff --git a/server/dancedeets/jobs/notify_users.py b/server/dancedeets/jobs/notify_users.py new file mode 100644 index 00000000..33cbed16 --- /dev/null +++ b/server/dancedeets/jobs/notify_users.py @@ -0,0 +1,170 @@ +""" +Cloud Run Job: Send push notifications about new events to users. + +Migrated from: dancedeets/notifications/added_events.py + +This job runs hourly and sends notifications to users in a specific +timezone about recently added events near them. + +Usage: + python -m dancedeets.jobs.runner --job=notify_users --offset=8 + python -m dancedeets.jobs.runner --job=notify_users # auto-calculates offset +""" + +import datetime +import logging +import time + +from google.cloud import datastore + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.loc import gmaps_api +from dancedeets.loc import math as loc_math +from dancedeets.notifications import android +from dancedeets.search import search +from dancedeets.search import search_base + +logger = logging.getLogger(__name__) + + +def get_time_offset() -> float: + """ + Calculate the timezone offset to target for 4pm local time notifications. + + Returns: + Float timezone offset (e.g., 8.0 for UTC+8) + """ + desired_hour = 16 # send new-event notifications at 4pm + current_hour = datetime.datetime.now().hour # should be UTC hour + offset = desired_hour - current_hour + if offset <= -12: + offset += 24 + if offset > 12: + offset -= 24 + return float(offset) + + +class NotifyUsersJob(Job): + """ + Job that sends push notifications about new events to users. + + For each user in the target timezone: + 1. Check if they can receive Android notifications + 2. 
Search for events near their location added in the last 24 hours + 3. Send push notifications for each new event + """ + + def __init__(self, offset: float = None, dry_run: bool = False): + super().__init__() + self.offset = offset if offset is not None else get_time_offset() + self.dry_run = dry_run + logger.info(f"NotifyUsersJob initialized for timezone offset {self.offset}") + + def run(self, user) -> None: + """Process a single user.""" + # Check if user can receive notifications + if not android.can_notify(user): + self.metrics.increment('users_skipped_no_android') + return + + if not user: + logger.error("No user provided") + return + + if user.expired_oauth_token: + logger.info(f"User has expired token, skipping: {user.fb_uid}") + self.metrics.increment('users_skipped_expired_token') + return + + user_location = user.location + if not user_location: + self.metrics.increment('users_skipped_no_location') + return + + logger.info(f"Processing user {user.fb_uid}") + + distance_in_km = user.distance_in_km() + min_attendees = user.min_attendees + + # Search for relevant events + geocode = gmaps_api.lookup_address(user_location) + if not geocode: + self.metrics.increment('users_skipped_geocode_failed') + return + + bounds = loc_math.expand_bounds(geocode.latlng_bounds(), distance_in_km) + query = search_base.SearchQuery( + time_period=search_base.TIME_UPCOMING, + bounds=bounds, + min_attendees=min_attendees, + ) + + one_day_ago = time.mktime( + (datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple() + ) + + search_query = search.Search(query) + search_query.extra_fields = ['creation_time'] + search_results = search_query._get_candidate_doc_events() + + # Filter to recently added events + recent_events = [ + x.doc_id + for x in search_results + if x.field('creation_time').value > one_day_ago + ] + + logger.info( + f"Found {len(search_results)} search results, " + f"{len(recent_events)} new events for user {user.fb_uid}" + ) + + self.metrics.increment('events_found', len(recent_events)) + + for event_id in recent_events: + if self.dry_run: + logger.info(f"[DRY RUN] Would notify user {user.fb_uid} about event {event_id}") + self.metrics.increment('notifications_would_send') + else: + if android.add_notify(user, event_id): + logger.info(f"Sent notification to {user.fb_uid} for event {event_id}") + self.metrics.increment('notifications_sent') + + self.metrics.increment('users_processed') + + +def main(offset: float = None, dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the notify_users job. 
+ + Args: + offset: Timezone offset to target (auto-calculated if not provided) + dry_run: If True, don't actually send notifications + """ + if offset is None: + offset = get_time_offset() + + logger.info(f"Starting notify_users job for timezone offset {offset}") + + job = NotifyUsersJob(offset=offset, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Query users in the target timezone range + filters = [ + ('timezone_offset', '>=', offset), + ('timezone_offset', '<', offset + 1), + ] + + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/post_japan_events.py b/server/dancedeets/jobs/post_japan_events.py new file mode 100644 index 00000000..670d4f2e --- /dev/null +++ b/server/dancedeets/jobs/post_japan_events.py @@ -0,0 +1,93 @@ +""" +Cloud Run Job: Post future Japan events to social media. + +Migrated from: dancedeets/pubsub/pubsub_tasks.py + +This job finds all future dance events in Japan and publishes them +to configured social media accounts. + +Usage: + python -m dancedeets.jobs.runner --job=post_japan_events + python -m dancedeets.jobs.runner --job=post_japan_events --token_nickname=twitter_jp +""" + +import logging + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.pubsub import pubsub +from dancedeets.util import dates + +logger = logging.getLogger(__name__) + + +class PostJapanEventsJob(BatchJob): + """ + Job that posts Japan events to social media. + + Processes events in batches, filtering to only those in Japan, + then publishes each to social media via the pubsub module. + """ + + def __init__(self, token_nickname: str = None, dry_run: bool = False): + super().__init__(batch_size=20) + self.token_nickname = token_nickname + self.dry_run = dry_run + logger.info(f"PostJapanEventsJob initialized with token_nickname={token_nickname}") + + def run_batch(self, db_events: list) -> None: + """Process a batch of events.""" + # Filter to Japan events + japan_events = [ + event for event in db_events + if event.actual_city_name and event.actual_city_name.endswith('Japan') + ] + + logger.info(f"Batch: {len(db_events)} events, {len(japan_events)} in Japan") + self.metrics.increment('events_total', len(db_events)) + self.metrics.increment('events_in_japan', len(japan_events)) + + for db_event in japan_events: + try: + if self.dry_run: + logger.info(f"[DRY RUN] Would publish event {db_event.id}") + self.metrics.increment('events_would_publish') + else: + pubsub.eventually_publish_event(db_event.id, self.token_nickname) + self.metrics.increment('events_published') + except Exception as e: + logger.error(f"Error publishing event {db_event.id}: {e}") + self.metrics.increment('events_failed') + + +def main(token_nickname: str = None, dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the post_japan_events job. 
+ + Args: + token_nickname: Optional social media token nickname to use + dry_run: If True, don't actually publish events + """ + logger.info(f"Starting post_japan_events job") + + job = PostJapanEventsJob(token_nickname=token_nickname, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Query future events + filters = [ + ('search_time_period', '=', dates.TIME_FUTURE), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/refresh_users.py b/server/dancedeets/jobs/refresh_users.py new file mode 100644 index 00000000..5a02b533 --- /dev/null +++ b/server/dancedeets/jobs/refresh_users.py @@ -0,0 +1,174 @@ +""" +Cloud Run Job: Refresh user profiles from Facebook. + +Migrated from: dancedeets/users/user_tasks.py + +This job refreshes user profile information from the Facebook API +and updates the local user records. + +Usage: + python -m dancedeets.jobs.runner --job=refresh_users + python -m dancedeets.jobs.runner --job=refresh_users --all_users=true +""" + +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_fblookup_params, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.mail import mailchimp_api + +logger = logging.getLogger(__name__) + + +class RefreshUsersJob(Job): + """ + Job that refreshes user profiles from Facebook. + + For each user: + 1. Check if they have a valid access token + 2. Fetch updated profile info from Facebook + 3. Update local user record + 4. 
Optionally update Mailchimp subscription + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + mailchimp_list_id: Optional[str] = None, + dry_run: bool = False, + ): + super().__init__() + self.fb_context = fb_context + self.mailchimp_list_id = mailchimp_list_id or mailchimp_api.get_list_id() + self.dry_run = dry_run + logger.info(f"RefreshUsersJob initialized, mailchimp_list_id={mailchimp_list_id}") + + def run(self, user) -> None: + """Process a single user.""" + if user.expired_oauth_token: + logger.info( + f"Skipping user {user.fb_uid} ({user.full_name}) " + "due to expired access_token" + ) + self.metrics.increment('users_skipped_expired') + if not self.dry_run: + user.put() # Save any pending changes + return + + # Get access token (prefer user's own token, fall back to context) + access_token = user.fb_access_token + if not access_token and self.fb_context: + access_token = self.fb_context.access_token + + if not access_token: + logger.info( + f"Skipping user {user.fb_uid} ({user.full_name}) " + "due to not having an access_token" + ) + self.metrics.increment('users_skipped_no_token') + if not self.dry_run: + user.put() + return + + # Fetch and update user from Facebook + try: + self._fetch_and_save_fb_user(user, access_token) + self.metrics.increment('users_refreshed') + except Exception as e: + logger.error(f"Error refreshing user {user.fb_uid}: {e}") + self.metrics.increment('users_failed') + + def _fetch_and_save_fb_user(self, user, access_token: str) -> None: + """Fetch user data from Facebook and save.""" + fbl = fb_api.FBLookup(user.fb_uid, access_token) + + if self.fb_context: + fbl.allow_cache = self.fb_context.allow_cache + if self.fb_context.oldest_allowed: + fbl.db.oldest_allowed = self.fb_context.oldest_allowed + + try: + fb_user = fbl.get(fb_api.LookupUser, user.fb_uid) + except fb_api.ExpiredOAuthToken as e: + logger.info(f"Auth token now expired for {user.fb_uid}: {e}") + user.expired_oauth_token_reason = str(e.args[0]) if e.args else "Unknown" + user.expired_oauth_token = True + if not self.dry_run: + user.put() + self.metrics.increment('users_token_expired') + return + + if self.dry_run: + logger.info(f"[DRY RUN] Would update user {user.fb_uid}") + return + + user.compute_derived_properties(fb_user) + user.put() + + # Update Mailchimp if configured + # Note: mailchimp update is typically handled by user.put() via signals + + +def main( + all_users: bool = False, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the refresh_users job. 
+ + Args: + all_users: If True, include users with expired tokens + dry_run: If True, don't save changes + """ + logger.info(f"Starting refresh_users job, all_users={all_users}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + # Create FB context with token pool + fb_context = FBJobContext( + fb_uid='system', # System-level access + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + # Get Mailchimp list ID + try: + mailchimp_list_id = mailchimp_api.get_list_id() + except Exception as e: + logger.warning(f"Could not get Mailchimp list ID: {e}") + mailchimp_list_id = None + + job = RefreshUsersJob( + fb_context=fb_context, + mailchimp_list_id=mailchimp_list_id, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if not all_users: + filters.append(('expired_oauth_token', '=', False)) + + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/reindex_classes.py b/server/dancedeets/jobs/reindex_classes.py new file mode 100644 index 00000000..d43c29e3 --- /dev/null +++ b/server/dancedeets/jobs/reindex_classes.py @@ -0,0 +1,50 @@ +""" +Cloud Run Job: Reindex dance classes in search. + +Migrated from: dancedeets/classes/class_pipeline.py (ReindexClasses) + +This job rebuilds the dance class search index from scraped data. + +Usage: + python -m dancedeets.jobs.runner --job=reindex_classes +""" + +import logging + +from dancedeets.classes import class_index +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the reindex_classes job. + + Args: + dry_run: If True, don't actually reindex + """ + logger.info("Starting reindex_classes job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + if dry_run: + logger.info("[DRY RUN] Would rebuild class index") + metrics.increment('reindex_skipped') + else: + logger.info("Rebuilding class index...") + try: + class_index.StudioClassIndex.rebuild_from_query() + logger.info("Class index rebuilt successfully") + metrics.increment('reindex_completed') + except Exception as e: + logger.error(f"Error rebuilding class index: {e}") + metrics.increment('reindex_failed') + raise + + metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/runner.py b/server/dancedeets/jobs/runner.py new file mode 100644 index 00000000..8a0c3419 --- /dev/null +++ b/server/dancedeets/jobs/runner.py @@ -0,0 +1,141 @@ +""" +Cloud Run Job runner entry point. + +This module provides a CLI interface for running jobs from Cloud Run. +Jobs are specified by name and executed with the provided parameters. 
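+
+Any extra --key=value flags are forwarded to the job's main() as keyword
+arguments; 'true'/'false' and numeric values are converted to bool, int, or
+float before being passed along.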
+ +Usage: + python -m dancedeets.jobs.runner --job=notify_users --offset=8 + python -m dancedeets.jobs.runner --job=generate_sitemaps --vertical=STREET +""" + +import argparse +import importlib +import logging +import os +import sys + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + +# Registry of available jobs +JOB_REGISTRY = { + # Phase 2: Simple mapper jobs + 'notify_users': 'dancedeets.jobs.notify_users', + 'post_japan_events': 'dancedeets.jobs.post_japan_events', + 'compute_rankings': 'dancedeets.jobs.compute_rankings', + 'compute_user_stats': 'dancedeets.jobs.compute_user_stats', + 'refresh_users': 'dancedeets.jobs.refresh_users', + 'send_weekly_emails': 'dancedeets.jobs.send_weekly_emails', + + # Phase 3: GCS output jobs + 'generate_sitemaps': 'dancedeets.jobs.generate_sitemaps', + 'dump_potential_events': 'dancedeets.jobs.dump_potential_events', + 'generate_training_data': 'dancedeets.jobs.generate_training_data', + 'classify_events_ml': 'dancedeets.jobs.classify_events_ml', + 'auto_add_events': 'dancedeets.jobs.auto_add_events', + + # Phase 4: MapReduce pipeline jobs + 'count_unique_attendees': 'dancedeets.jobs.count_unique_attendees', + 'update_source_stats': 'dancedeets.jobs.update_source_stats', + 'scrape_and_classify': 'dancedeets.jobs.scrape_and_classify', + 'find_access_tokens': 'dancedeets.jobs.find_access_tokens', + + # Phase 5: Pipeline orchestration jobs (individual steps) + 'start_spiders': 'dancedeets.jobs.start_spiders', + 'reindex_classes': 'dancedeets.jobs.reindex_classes', + 'email_crawl_errors': 'dancedeets.jobs.email_crawl_errors', +} + + +def run_job(job_name: str, **kwargs) -> None: + """ + Run a job by name with the given parameters. + + Args: + job_name: Name of the job from JOB_REGISTRY + **kwargs: Job-specific parameters + """ + if job_name not in JOB_REGISTRY: + available = ', '.join(sorted(JOB_REGISTRY.keys())) + raise ValueError(f"Unknown job: {job_name}. 
Available jobs: {available}") + + module_path = JOB_REGISTRY[job_name] + logger.info(f"Loading job module: {module_path}") + + try: + module = importlib.import_module(module_path) + except ImportError as e: + logger.error(f"Failed to import job module {module_path}: {e}") + raise + + if not hasattr(module, 'main'): + raise ValueError(f"Job module {module_path} must have a main() function") + + logger.info(f"Running job: {job_name}") + logger.info(f"Parameters: {kwargs}") + + # Cloud Run Job environment info + task_index = os.environ.get('CLOUD_RUN_TASK_INDEX', '0') + task_count = os.environ.get('CLOUD_RUN_TASK_COUNT', '1') + logger.info(f"Task {int(task_index) + 1} of {task_count}") + + try: + module.main(**kwargs) + logger.info(f"Job {job_name} completed successfully") + except Exception as e: + logger.error(f"Job {job_name} failed: {e}") + raise + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description='Run a DanceDeets batch job') + parser.add_argument( + '--job', + required=True, + help='Name of the job to run', + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Run in dry-run mode (no side effects)', + ) + + # Allow arbitrary additional arguments + args, unknown = parser.parse_known_args() + + # Parse unknown args as key=value pairs + extra_args = {} + for arg in unknown: + if '=' in arg: + key, value = arg.split('=', 1) + key = key.lstrip('-') + # Try to convert to appropriate types + if value.lower() in ('true', 'false'): + value = value.lower() == 'true' + elif value.isdigit(): + value = int(value) + elif value.replace('.', '').isdigit(): + value = float(value) + extra_args[key] = value + + return args, extra_args + + +def main(): + """Main entry point for the job runner.""" + args, extra_args = parse_args() + + if args.dry_run: + extra_args['dry_run'] = True + + run_job(args.job, **extra_args) + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/scrape_and_classify.py b/server/dancedeets/jobs/scrape_and_classify.py new file mode 100644 index 00000000..c873e673 --- /dev/null +++ b/server/dancedeets/jobs/scrape_and_classify.py @@ -0,0 +1,192 @@ +""" +Cloud Run Job: Scrape sources for events and classify them. + +Migrated from: dancedeets/event_scraper/thing_scraper2.py + +This job scrapes configured sources (fan pages, profiles, etc.) for +event listings, then classifies discovered events. + +Usage: + python -m dancedeets.jobs.runner --job=scrape_and_classify + python -m dancedeets.jobs.runner --job=scrape_and_classify --min_potential_events=5 +""" + +import json +import logging +from collections import defaultdict +from typing import Dict, List, Optional, Set + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.event_scraper import event_pipeline +from dancedeets.event_scraper import potential_events +from dancedeets.event_scraper import thing_scraper + +logger = logging.getLogger(__name__) + + +class ScrapeAndClassifyJob(BatchJob): + """ + Job that scrapes sources and classifies discovered events. + + This combines the map and reduce steps from the original MapReduce: + 1. Map: Scrape each source for events + 2. Reduce: Group by event_id and process + + Uses in-memory aggregation since event volumes are manageable. 
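+
+    Shape of the in-memory aggregate built during the scrape phase (the
+    ids and field names below are purely illustrative):
+
+        self.discovered_events = {
+            '1234567890': [                      # FB event id
+                ('111', 'FIELD_EVENTS', None),   # (source_id, source_field, extra_source_id)
+                ('222', 'FIELD_FEED', None),
+            ],
+        }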
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + min_potential_events: int = 0, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.min_potential_events = min_potential_events + self.dry_run = dry_run + + # Aggregate discovered events by event_id + # event_id -> list of (source_id, source_field, extra_source_id) + self.discovered_events: Dict[str, List[tuple]] = defaultdict(list) + + logger.info(f"ScrapeAndClassifyJob initialized with min_potential_events={min_potential_events}") + + def run_batch(self, sources: list) -> None: + """Process a batch of sources (scraping phase).""" + # Filter sources by min_potential_events + if self.min_potential_events > 0: + sources = [ + s for s in sources + if (s.num_potential_events or 0) >= self.min_potential_events + ] + + if not sources: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_cache = False # Don't cache during scraping + # Make passthrough to avoid unnecessary memcache puts + fbl.make_passthrough() + + # Discover events from sources + try: + discovered_list = thing_scraper.discover_events_from_sources(fbl, sources) + except Exception as e: + logger.error(f"Error scraping sources: {e}") + self.metrics.increment('batches_failed_scrape') + return + + # Aggregate by event_id + for discovered in discovered_list: + state = ( + discovered.source_id, + discovered.source_field, + discovered.extra_source_id, + ) + self.discovered_events[discovered.event_id].append(state) + self.metrics.increment('events_discovered') + + self.metrics.increment('sources_scraped', len(sources)) + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Process all discovered events (reduce phase).""" + logger.info(f"Processing {len(self.discovered_events)} unique events") + + if self.dry_run: + logger.info("[DRY RUN] Would process discovered events") + for event_id, sources in list(self.discovered_events.items())[:10]: + logger.info(f" Event {event_id}: {len(sources)} sources") + return + + if not self.fb_context: + logger.warning("No FB context, skipping event processing") + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_cache = True # Use cache for classification + + # Process events in batches + events_processed = 0 + events_failed = 0 + + for event_id, source_list in self.discovered_events.items(): + try: + # Build discovered event objects + discovered_list = [] + for source_id, source_field, extra_source_id in source_list: + discovered = potential_events.DiscoveredEvent( + event_id, None, source_field, extra_source_id + ) + discovered.source = None + discovered.source_id = source_id + discovered_list.append(discovered) + + # Process through event pipeline + event_pipeline.process_discovered_events(fbl, discovered_list) + events_processed += 1 + + except Exception as e: + logger.error(f"Error processing event {event_id}: {e}") + events_failed += 1 + + self.metrics.increment('events_processed', events_processed) + self.metrics.increment('events_failed', events_failed) + logger.info(f"Processed {events_processed} events, {events_failed} failed") + + +def main( + min_potential_events: int = 0, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the scrape_and_classify job. 
+ + Args: + min_potential_events: Only scrape sources with at least this many potential events + dry_run: If True, don't actually process events + """ + logger.info(f"Starting scrape_and_classify job with min_potential_events={min_potential_events}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=False, + ) if tokens else None + + job = ScrapeAndClassifyJob( + fb_context=fb_context, + min_potential_events=min_potential_events, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.thing_db.Source', + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/send_weekly_emails.py b/server/dancedeets/jobs/send_weekly_emails.py new file mode 100644 index 00000000..7c7f9f30 --- /dev/null +++ b/server/dancedeets/jobs/send_weekly_emails.py @@ -0,0 +1,292 @@ +""" +Cloud Run Job: Send weekly event digest emails to users. + +Migrated from: dancedeets/search/email_events.py + +This job sends a weekly email to users with dance events near them. + +Usage: + python -m dancedeets.jobs.runner --job=send_weekly_emails + python -m dancedeets.jobs.runner --job=send_weekly_emails --dry_run=true +""" + +import datetime +import logging +import random +import re +import urllib.parse +from typing import Optional + +from dancedeets import fb_api +from dancedeets import render_server +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.loc import names +from dancedeets.logic import api_format +from dancedeets.logic import mobile +from dancedeets.mail import mandrill_api +from dancedeets.search import search_base +from dancedeets.search import search +from dancedeets.users import users + +logger = logging.getLogger(__name__) + + +class NoEmailException(Exception): + """Raised when email cannot be sent for a user.""" + pass + + +def email_for_user(user, fbl, should_send: bool = True): + """ + Generate and optionally send a weekly email for a user. 
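+
+    A minimal sketch of calling this directly, mirroring what
+    SendWeeklyEmailsJob.run() does further below (token handling is up
+    to the caller):
+
+        fbl = fb_api.FBLookup(user.fb_uid, access_token)
+        fbl.request(fb_api.LookupUser, user.fb_uid)
+        fbl.request(fb_api.LookupUserEvents, user.fb_uid)
+        fbl.batch_fetch()
+        message = email_for_user(user, fbl, should_send=False)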
+ + Args: + user: User object + fbl: FBLookup instance + should_send: Whether to actually send the email + + Returns: + The email message dict + + Raises: + NoEmailException: If email cannot be sent for various reasons + """ + if not user.send_email: + raise NoEmailException('User has send_email==False') + + email_address = user.email + if not email_address: + raise NoEmailException('User does not have an email') + + # Check if we sent an email recently + if user.weekly_email_send_date: + if user.weekly_email_send_date > datetime.datetime.now() - datetime.timedelta(days=3): + message = f"Skipping user {user.fb_uid} ({user.full_name}) because last weekly email was sent on {user.weekly_email_send_date}" + logger.warning(message) + raise NoEmailException(message) + + fb_user = fbl.fetched_data(fb_api.LookupUser, fbl.fb_uid) + if 'profile' not in fb_user: + raise NoEmailException(f'Could not find LookupUser: {fb_user}') + + user_location = user.location + if not user_location: + raise NoEmailException('User does not have location') + + # Build search query for this week's events + d = datetime.date.today() + start_time = d - datetime.timedelta(days=d.weekday()) # round down to last monday + end_time = start_time + datetime.timedelta(days=8) + data = { + 'location': user_location, + 'distance': user.distance_in_km(), + 'distance_units': 'km', + 'start': start_time, + 'end': end_time, + } + form = search_base.SearchForm(data=data) + + geocode = None + distance = None + if form.location.data: + try: + geocode, distance = search_base.get_geocode_with_distance(form) + except Exception as e: + raise NoEmailException(f'Could not normalize user location: {data}: {e}') + + try: + search_query = form.build_query(start_end_query=True) + except Exception: + logger.error(f'Error looking up user location for user {user.fb_uid}, form: {form}') + raise + + search_results = search.Search(search_query).get_search_results() + if not search_results: + raise NoEmailException('No search results for user') + + # Build the email content + need_full_event = False + json_search_response = api_format.build_search_results_api( + form, search_query, search_results, (2, 0), need_full_event, geocode, distance, skip_people=True + ) + locale = user.locale or 'en_US' + email_unsubscribe_url = f'https://www.dancedeets.com/user/unsubscribe?email={urllib.parse.quote(email_address)}' + props = { + 'user': { + 'userName': user.first_name or user.full_name or '', + 'city': user.city_name, + 'countryName': names.get_country_name(user.location_country), + }, + 'response': json_search_response, + 'currentLocale': locale.replace('_', '-'), + 'mobileIosUrl': mobile.IOS_URL, + 'mobileAndroidUrl': mobile.ANDROID_URL, + 'emailPreferencesUrl': email_unsubscribe_url, + } + + # Render the email template + email_template = 'weeklyMail.js' + response = render_server.render_jsx(email_template, props, static_html=True) + if response.error: + message = f'Error rendering weeklyMail.js: {response.error}' + logger.error(message) + raise NoEmailException(message) + + mjml_response = render_server.render_mjml(response.markup) + rendered_html = mjml_response['html'] + if mjml_response.get('errors'): + message = f'Errors rendering weeklyMail.mjml: {mjml_response["errors"]}' + logger.error(message) + raise NoEmailException(message) + + # Build the message + messages = [ + 'Your Week in Dance: %s', + 'DanceDeets Weekly: %s', + 'Dance Events for %s', + ] + message_template = random.choice(messages) + tag = re.sub(r'[^a-z]', '-', 
message_template.lower())[:50] + tags = ['weekly', tag] + + subject = message_template % d.strftime('%b %d, %Y') + message = { + 'from_email': 'events@dancedeets.com', + 'from_name': 'DanceDeets Events', + 'subject': subject, + 'to': [{ + 'email': email_address, + 'name': user.full_name or user.first_name or '', + 'type': 'to', + }], + 'html': rendered_html, + 'metadata': { + 'user_id': user.fb_uid, + 'email_type': 'weekly', + }, + 'tags': tags, + } + + if should_send: + logger.info(f'Sending weekly mail for user {user.fb_uid} ({user.full_name})') + # Update the last-sent-time here, so any retryable errors don't cause emails to be multi-sent + user = users.User.get_by_id(user.fb_uid) + user.weekly_email_send_date = datetime.datetime.now() + user.put() + # And send the message now + mandrill_api.send_message(message) + + return message + + +class SendWeeklyEmailsJob(Job): + """ + Job that sends weekly event digest emails to users. + + For each user: + 1. Fetch user profile from Facebook + 2. Search for events near their location + 3. Render and send email via Mandrill + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + dry_run: bool = False, + ): + super().__init__() + self.fb_context = fb_context + self.dry_run = dry_run + logger.info("SendWeeklyEmailsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + # Get access token + access_token = user.fb_access_token + if not access_token and self.fb_context: + access_token = self.fb_context.access_token + + if not access_token: + logger.info(f"Skipping user {user.fb_uid} - no access token") + self.metrics.increment('users_skipped_no_token') + return + + # Create FBLookup for this user + fbl = fb_api.FBLookup(user.fb_uid, access_token) + if self.fb_context: + fbl.allow_cache = self.fb_context.allow_cache + + # Fetch user data from Facebook + fbl.request(fb_api.LookupUser, user.fb_uid) + fbl.request(fb_api.LookupUserEvents, user.fb_uid) + + try: + fbl.batch_fetch() + except fb_api.ExpiredOAuthToken as e: + logger.info(f"Auth token now expired for {user.fb_uid}: {e}") + user.expired_oauth_token_reason = str(e.args[0]) if e.args else "Unknown" + user.expired_oauth_token = True + if not self.dry_run: + user.put() + self.metrics.increment('users_token_expired') + return + + # Generate and send email + try: + should_send = not self.dry_run + email_for_user(user, fbl, should_send=should_send) + + if self.dry_run: + logger.info(f"[DRY RUN] Would send email to {user.fb_uid}") + self.metrics.increment('emails_would_send') + else: + self.metrics.increment('emails_sent') + + except NoEmailException as e: + logger.info(f"Not sending email for user {user.fb_uid}: {e}") + self.metrics.increment('users_skipped_no_email') + + except Exception as e: + logger.exception(f"Error sending email for user {user.fb_uid}") + self.metrics.increment('emails_failed') + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the send_weekly_emails job. 
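+
+    A dry run renders the emails but skips both the Mandrill send and
+    the weekly_email_send_date update:
+
+        python -m dancedeets.jobs.runner --job=send_weekly_emails --dry-run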
+ + Args: + dry_run: If True, don't actually send emails + """ + logger.info("Starting send_weekly_emails job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + # Create FB context with token pool + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = SendWeeklyEmailsJob(fb_context=fb_context, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/start_spiders.py b/server/dancedeets/jobs/start_spiders.py new file mode 100644 index 00000000..b74ffd1d --- /dev/null +++ b/server/dancedeets/jobs/start_spiders.py @@ -0,0 +1,120 @@ +""" +Cloud Run Job: Start ScrapingHub spider jobs. + +Migrated from: dancedeets/classes/class_pipeline.py (start_spiders) + +This job triggers spider crawls on ScrapingHub for dance studio schedules. + +Usage: + python -m dancedeets.jobs.runner --job=start_spiders +""" + +import json +import logging +import os +from typing import List + +import scrapinghub + +from dancedeets import keys +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +# Spiders that are currently disabled +DISABLED_SPIDERS = ['EXPG', 'Boogiezone', 'IDA', 'mL', 'NeighborhoodStudio'] + + +def get_spiders() -> List[str]: + """Get list of active spiders.""" + all_spiders = [ + # NY + 'PMT', + 'Evolution', + 'Peridance', + 'BDC', + 'EXPG', + # LA + 'Millenium', + 'EDGE', + 'DebbieReynolds', + 'TheLab', + 'Boogiezone', + 'IDA', + 'mL', + 'NeighborhoodStudio', + ] + return [s for s in all_spiders if s not in DISABLED_SPIDERS] + + +def get_shub_project(): + """Get ScrapingHub project connection.""" + api_key = keys.get('scrapinghub_key') + conn = scrapinghub.Connection(api_key) + project = scrapinghub.Project(conn, 27474) + return project + + +def start_spiders(spiders: List[str], dry_run: bool = False) -> List[str]: + """ + Start spider jobs on ScrapingHub. + + Args: + spiders: List of spider names to run + dry_run: If True, don't actually start spiders + + Returns: + List of job keys for started spiders + """ + if dry_run: + logger.info(f"[DRY RUN] Would start {len(spiders)} spiders: {spiders}") + return [f"dry-run-{s}" for s in spiders] + + project = get_shub_project() + job_keys = [] + + for spider in spiders: + try: + job_id = project.schedule(spider) + job_keys.append(job_id) + logger.info(f"Scheduled spider {spider}: {job_id}") + except Exception as e: + logger.error(f"Error scheduling spider {spider}: {e}") + + logger.info(f"Scheduled {len(job_keys)} jobs: {job_keys}") + return job_keys + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the start_spiders job. 
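+
+    The JSON payload printed at the end of the job (see Outputs below)
+    looks like this, e.g. for a dry run over two active spiders:
+
+        {"jobKeys": ["dry-run-PMT", "dry-run-Evolution"], "spidersStarted": 2}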
+ + Args: + dry_run: If True, don't actually start spiders + + Outputs: + Writes job keys to stdout for workflow consumption + """ + logger.info("Starting start_spiders job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + spiders = get_spiders() + logger.info(f"Active spiders: {spiders}") + + job_keys = start_spiders(spiders, dry_run=dry_run) + + metrics.increment('spiders_started', len(job_keys)) + metrics.log_summary() + + # Output job keys for workflow to consume + output = { + 'jobKeys': job_keys, + 'spidersStarted': len(job_keys), + } + print(json.dumps(output)) + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/update_source_stats.py b/server/dancedeets/jobs/update_source_stats.py new file mode 100644 index 00000000..a77e0969 --- /dev/null +++ b/server/dancedeets/jobs/update_source_stats.py @@ -0,0 +1,130 @@ +""" +Cloud Run Job: Update source statistics. + +Migrated from: dancedeets/event_scraper/thing_db.py (mr_count_potential_events) + +This job counts potential events, real events, and false negatives +per source (fan pages, profiles, etc.) for source quality tracking. + +Usage: + python -m dancedeets.jobs.runner --job=update_source_stats +""" + +import json +import logging +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +from dancedeets.events import eventdata +from dancedeets.event_scraper import thing_db +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +class UpdateSourceStatsJob(Job): + """ + Job that updates source statistics. + + For each potential event, counts: + - Whether it's a potential event (match_score > 0) + - Whether it became a real event (exists in DBEvent) + - Whether it's a false negative (real but not potential) + + Aggregates by source and updates Source entities. 
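+
+    Sketch of the per-source aggregate accumulated in memory (the source
+    id and counts are illustrative):
+
+        self.source_counts = {
+            '107687589275667': {'all': 12, 'potential': 7, 'real': 3, 'false_negative': 1},
+        }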
+ """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + + # Aggregate counts per source: source_id -> (potential, real, false_negative, total) + self.source_counts: Dict[str, Dict[str, int]] = defaultdict( + lambda: {'all': 0, 'potential': 0, 'real': 0, 'false_negative': 0} + ) + + logger.info("UpdateSourceStatsJob initialized") + + def run(self, pe) -> None: + """Process a single potential event.""" + # Check if this became a real event + db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id) + + is_potential_event = pe.match_score > 0 + real_event = db_event is not None + false_negative = bool(db_event and not is_potential_event) + + # Aggregate for each source + for source_id in pe.source_ids_only(): + source_id_str = str(source_id) + self.source_counts[source_id_str]['all'] += 1 + if is_potential_event: + self.source_counts[source_id_str]['potential'] += 1 + if real_event: + self.source_counts[source_id_str]['real'] += 1 + if false_negative: + self.source_counts[source_id_str]['false_negative'] += 1 + + self.metrics.increment('events_processed') + + def teardown(self) -> None: + """Update all Source entities with aggregated counts.""" + logger.info(f"Updating {len(self.source_counts)} sources") + + sources_updated = 0 + sources_not_found = 0 + + for source_id, counts in self.source_counts.items(): + source = thing_db.Source.get_by_key_name(source_id) + if not source: + logger.debug(f"Source not found: {source_id}") + sources_not_found += 1 + continue + + if self.dry_run: + logger.debug( + f"[DRY RUN] Would update source {source_id}: " + f"all={counts['all']}, potential={counts['potential']}, " + f"real={counts['real']}, false_negative={counts['false_negative']}" + ) + else: + source.num_all_events = counts['all'] + source.num_potential_events = counts['potential'] + source.num_real_events = counts['real'] + source.num_false_negatives = counts['false_negative'] + source.put() + + sources_updated += 1 + + self.metrics.increment('sources_updated', sources_updated) + self.metrics.increment('sources_not_found', sources_not_found) + + if self.dry_run: + logger.info(f"[DRY RUN] Would update {sources_updated} sources") + else: + logger.info(f"Updated {sources_updated} sources") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the update_source_stats job. 
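+
+    A dry run (aggregate the counts but skip the Source writes) can be
+    launched via the generic runner:
+
+        python -m dancedeets.jobs.runner --job=update_source_stats --dry-run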
+ + Args: + dry_run: If True, don't update sources + """ + logger.info("Starting update_source_stats job") + + job = UpdateSourceStatsJob(dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/requirements-jobs.txt b/server/requirements-jobs.txt new file mode 100644 index 00000000..f4c12ec5 --- /dev/null +++ b/server/requirements-jobs.txt @@ -0,0 +1,22 @@ +# Requirements for Cloud Run Jobs +# Minimal dependencies for batch processing + +# Google Cloud clients +google-cloud-datastore>=2.15.0 +google-cloud-storage>=2.10.0 +google-cloud-tasks>=2.14.0 + +# Optional: Cloud Monitoring for metrics export +# google-cloud-monitoring>=2.16.0 + +# HTTP requests +requests>=2.31.0 + +# XML processing (for sitemaps) +lxml>=4.9.0 + +# Date/time utilities +python-dateutil>=2.8.2 + +# Environment variables +python-dotenv>=1.0.0 diff --git a/server/workflows/crawl_and_index_classes.yaml b/server/workflows/crawl_and_index_classes.yaml new file mode 100644 index 00000000..e0b1f665 --- /dev/null +++ b/server/workflows/crawl_and_index_classes.yaml @@ -0,0 +1,104 @@ +# Cloud Workflow: Crawl and Index Dance Classes +# +# Migrated from: dancedeets/classes/class_pipeline.py +# +# This workflow orchestrates the dance class scraping pipeline: +# 1. Start spider jobs on ScrapingHub +# 2. Wait for spiders to complete +# 3. Reindex classes in search +# 4. Email any crawl errors +# +# Usage: +# gcloud workflows run crawl-and-index-classes + +main: + params: [args] + steps: + - init: + assign: + - project_id: ${sys.get_env("GOOGLE_CLOUD_PROJECT")} + - region: "us-central1" + - run_time: ${time.format(sys.now())} + + - start_spiders: + call: http.post + args: + url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/start-spiders:run"} + auth: + type: OIDC + result: spider_result + + - get_job_keys: + assign: + - job_keys: ${spider_result.body.jobKeys} + + - wait_for_spiders: + call: wait_for_completion + args: + job_keys: ${job_keys} + max_attempts: 60 + delay_seconds: 30 + result: jobs_completed + + - parallel_finalize: + parallel: + branches: + - reindex: + call: http.post + args: + url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/reindex-classes:run"} + auth: + type: OIDC + - email_errors: + call: http.post + args: + url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/email-crawl-errors:run"} + body: + run_time: ${run_time} + job_keys: ${job_keys} + auth: + type: OIDC + + - return_result: + return: + status: "completed" + run_time: ${run_time} + jobs_completed: ${jobs_completed} + +# Subworkflow: Wait for ScrapingHub jobs to complete +wait_for_completion: + params: [job_keys, max_attempts, delay_seconds] + steps: + - init_wait: + assign: + - attempt: 0 + + - check_jobs: + call: http.get + args: + url: "https://app.scrapinghub.com/api/jobs/list.json" + query: + job: ${job_keys} + auth: + type: OIDC + result: jobs_status + + - evaluate_status: + switch: + - condition: ${all(job.state == "finished" for job in jobs_status.body.jobs)} + next: return_success + - condition: ${attempt >= max_attempts} + raise: "Timeout waiting for spider jobs to complete" + + - increment_attempt: + assign: + - attempt: ${attempt + 1} + + - wait: + 
call: sys.sleep + args: + seconds: ${delay_seconds} + next: check_jobs + + - return_success: + return: true From ecc537222f12cb59b9abf5b5519e33eaba2d3c9b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Dec 2025 05:47:34 +0000 Subject: [PATCH 3/3] Remove legacy mapreduce/pipeline code after Cloud Run Jobs migration This cleanup removes the old mapreduce and pipeline code from original files after migrating all batch processing to Cloud Run Jobs. Files cleaned up (mapreduce code removed, core functions retained): - notifications/added_events.py - kept promote_events_to_user() - sitemaps/events.py - kept generate_sitemap_entry() - ml/gprediction.py - kept predict(), get_predict_service() - users/user_event_tasks.py - kept update_user_qualities() - users/user_tasks.py - kept fetch_and_save_fb_user() - search/email_events.py - kept email_for_user() - pubsub/pubsub_tasks.py - kept social handlers - rankings/rankings.py - kept utility functions - event_scraper/auto_add.py - kept classification logic - event_scraper/thing_db.py - kept Source model - event_scraper/thing_scraper2.py - deprecation stub - classes/class_pipeline.py - deprecation stub Files deleted (fully migrated): - logic/mr_dump.py -> jobs/dump_potential_events.py - logic/unique_attendees.py -> jobs/count_unique_attendees.py - ml/mr_prediction.py -> jobs/classify_events_ml.py The compat/ layer is retained with LEGACY_APIS_ENABLED=False for remaining imports that use json_util.JsonProperty and other utilities. --- server/MIGRATION_PLAN.md | 30 +++ server/dancedeets/classes/class_pipeline.py | 166 +++------------ server/dancedeets/event_scraper/auto_add.py | 120 ++++++----- server/dancedeets/event_scraper/thing_db.py | 140 ++----------- .../event_scraper/thing_scraper2.py | 93 ++------ server/dancedeets/logic/mr_dump.py | 48 ----- server/dancedeets/logic/unique_attendees.py | 66 ------ server/dancedeets/ml/gprediction.py | 146 +++++++------ server/dancedeets/ml/mr_prediction.py | 56 ----- .../dancedeets/notifications/added_events.py | 74 ++++--- server/dancedeets/pubsub/pubsub_tasks.py | 42 +--- server/dancedeets/rankings/rankings.py | 198 ++++-------------- server/dancedeets/search/email_events.py | 26 +-- server/dancedeets/sitemaps/events.py | 160 ++++++-------- server/dancedeets/users/user_event_tasks.py | 73 ++++--- server/dancedeets/users/user_tasks.py | 76 +++---- 16 files changed, 476 insertions(+), 1038 deletions(-) delete mode 100644 server/dancedeets/logic/mr_dump.py delete mode 100644 server/dancedeets/logic/unique_attendees.py delete mode 100644 server/dancedeets/ml/mr_prediction.py diff --git a/server/MIGRATION_PLAN.md b/server/MIGRATION_PLAN.md index 126504b4..f1059be1 100644 --- a/server/MIGRATION_PLAN.md +++ b/server/MIGRATION_PLAN.md @@ -11,6 +11,36 @@ This document outlines the migration from legacy App Engine MapReduce/Pipeline t | Phase 3: GCS Output Jobs | ✅ COMPLETE | 5/5 jobs | | Phase 4: MapReduce Pipeline Jobs | ✅ COMPLETE | 3/4 jobs (find_access_tokens pending) | | Phase 5: Cloud Workflows | ✅ COMPLETE | 1 workflow + 3 jobs | +| Phase 6: Code Cleanup | ✅ COMPLETE | Old mapreduce code removed | + +## Cleanup Completed + +The following original files have been cleaned up to remove mapreduce/pipeline code: + +**Files modified (old mapreduce code removed, core functions retained):** +- `notifications/added_events.py` - Kept `promote_events_to_user()`, removed mapreduce handler +- `sitemaps/events.py` - Kept `generate_sitemap_entry()`, removed mapreduce handler +- `ml/gprediction.py` - Kept `predict()`, 
`get_predict_service()`, removed MR wrappers +- `users/user_event_tasks.py` - Kept `update_user_qualities()`, removed mapreduce handler +- `users/user_tasks.py` - Kept `fetch_and_save_fb_user()`, removed mapreduce handler +- `search/email_events.py` - Kept `email_for_user()`, removed mapreduce wrapper +- `pubsub/pubsub_tasks.py` - Kept social handlers, removed `PostJapanEventsHandler` MR code +- `rankings/rankings.py` - Kept utility functions, removed all mapreduce code +- `event_scraper/auto_add.py` - Kept classification logic, removed MR wrappers (added optional `metrics` param) +- `event_scraper/thing_db.py` - Kept Source model and helpers, removed MR pipeline code +- `event_scraper/thing_scraper2.py` - Replaced with deprecation stub +- `classes/class_pipeline.py` - Replaced with deprecation stub + +**Files deleted (fully migrated to Cloud Run Jobs):** +- `logic/mr_dump.py` → `jobs/dump_potential_events.py` +- `logic/unique_attendees.py` → `jobs/count_unique_attendees.py` +- `ml/mr_prediction.py` → `jobs/classify_events_ml.py` + +**Compat layer status:** +- `compat/` directory retained with `LEGACY_APIS_ENABLED = False` +- Provides stub implementations for imports that still reference mapreduce utilities +- `json_util.JsonProperty` still used by Source model +- Can be removed in future cleanup after all references are updated ### New Files Created diff --git a/server/dancedeets/classes/class_pipeline.py b/server/dancedeets/classes/class_pipeline.py index f3513705..28a068b6 100644 --- a/server/dancedeets/classes/class_pipeline.py +++ b/server/dancedeets/classes/class_pipeline.py @@ -1,150 +1,38 @@ -# class_indexing_pipeline - -import datetime +""" +Class crawling and indexing pipeline. + +This functionality has been migrated to Cloud Workflows and Cloud Run Jobs. +See: +- Workflow: workflows/crawl_and_index_classes.yaml +- Jobs: dancedeets.jobs.start_spiders + dancedeets.jobs.reindex_classes + dancedeets.jobs.email_crawl_errors + +This module is kept for backwards compatibility but the Pipeline +handlers have been removed. Use Cloud Workflows instead. +""" import logging -from dancedeets.compat.pipeline import common -from dancedeets.compat.pipeline import pipeline -import scrapinghub - from dancedeets import app from dancedeets import base_servlet -from dancedeets.classes import class_index -from dancedeets import keys -from dancedeets.mail import mandrill_api -from dancedeets.util import fixed_pipelines - -# TODO(mindbody): -DISABLED_SPIDERS = ['EXPG', 'Boogiezone', 'IDA', 'mL', 'NeighborhoodStudio'] - - -def get_spiders(): - return [ - # NY - 'PMT', - 'Evolution', - 'Peridance', - 'BDC', - 'EXPG', - # LA - 'Millenium', - 'EDGE', - 'DebbieReynolds', - 'TheLab', - 'Boogiezone', - 'IDA', - 'mL', - 'NeighborhoodStudio', - ] - # This depends on Twisted, which depends on zope.interface and lxml. And that whole ball of wax fails when run in the appengine dev sandbox. - # We can't import any of classes/scrapers/ (since it all ultimately depends on scrapy), so there's no great way to get a list of classes. 
- # Instead, class_pipeline_test does depend on it safely within nosetests, and verifies the above list matches what we get from scrapy's API) - # from scrapy.utils.project import get_project_settings - # from scrapy.crawler import CrawlerRunner - # runner = CrawlerRunner(get_project_settings()) - # return runner.spider_loader.list() - - -def get_shub_project(): - conn = scrapinghub.Connection(keys.get('scrapinghub_key')) - project = scrapinghub.Project(conn, 27474) - return project - - -def start_spiders(spiders): - project = get_shub_project() - job_keys = [] - for spider in spiders: - job_id = project.schedule(spider) - job_keys.append(job_id) - logging.info("Scheduled jobs: %s", job_keys) - return job_keys - - -class CrawlAndIndexClassesJob(fixed_pipelines.Pipeline): - def run(self): - run_time = datetime.datetime.now() - # Find all spiders by looking at modules on disk - spiders = set(get_spiders()).difference(DISABLED_SPIDERS) - - # Trigger new spider jobs on scrapinghub - job_keys = start_spiders(spiders) - - # Wait for crawls to finish - jobs_completed = yield WaitForJobs(job_keys) - - # In parallel, trigger reindex and emailing-of-errors - yield ReindexClasses(job_keys, jobs_completed) - yield EmailErrors(run_time, job_keys, jobs_completed) - - -class WaitForJobs(fixed_pipelines.Pipeline): - def run(self, job_keys): - project = get_shub_project() - jobs = [project.job(x) for x in job_keys] - unfinished = [x for x in jobs if x.info['state'] != 'finished'] - logging.info("Waiting for %s unfinished spiders", len(unfinished)) - if unfinished: - # Try again in 30 seconds - with pipeline.InOrder(): - yield common.Delay(seconds=30) - yield WaitForJobs(job_keys) - else: - yield common.Return(True) - - -class ReindexClasses(fixed_pipelines.Pipeline): - def run(self, job_keys, jobs_completed): - class_index.StudioClassIndex.rebuild_from_query() - - -class EmailErrors(fixed_pipelines.Pipeline): - def run(self, run_time, job_keys, jobs_completed): - project = get_shub_project() - jobs = [project.job(x) for x in job_keys] - - error_lines = {} - - for spider_job in jobs: - if not spider_job.info['items_scraped']: - error_lines.setdefault(spider_job.info['spider'], []).append('Could not find any items.') - - for line in spider_job.log(): - if line['level'] >= 40: - error_lines.setdefault(spider_job.info['spider'], []).append(line['message']) - - if not error_lines: - return - - rendered = ["The following crawl errors occurred:"] - for crawler, errors in error_lines.items(): - rendered += ["%s:" % crawler] - rendered += errors - rendered += [] - - body = '\n'.join(rendered) - logging.warning("%s", body) - - subject = 'Crawl Errors for %s' % run_time.strftime('%b %d, %Y: %H:%M') - message = { - 'from_email': 'reports@dancedeets.com', - 'from_name': 'DanceDeets Reports', - 'subject': subject, - 'to': [{ - 'email': 'reports@dancedeets.com', - 'name': 'DanceDeets Reports', - 'type': 'to', - }], - 'text': body, - } - mandrill_api.send_message(message) @app.route('/tasks/crawl_and_index_classes') class CrawlAndIndexClassesHandler(base_servlet.BaseTaskRequestHandler): + """ + Legacy handler - crawling has been migrated to Cloud Workflows. 
+ + Use Cloud Workflow: crawl_and_index_classes + Or individual jobs: + - python -m dancedeets.jobs.runner --job=start_spiders + - python -m dancedeets.jobs.runner --job=reindex_classes + - python -m dancedeets.jobs.runner --job=email_crawl_errors + """ def get(self): - pipeline = CrawlAndIndexClassesJob() - pipeline.start(queue_name='slow-queue') - self.response.out.write('OK') + logging.warning( + 'This endpoint is deprecated. ' + 'Use Cloud Workflow: crawl_and_index_classes instead.' + ) + self.response.out.write('DEPRECATED: Use Cloud Workflow crawl_and_index_classes instead') post = get diff --git a/server/dancedeets/event_scraper/auto_add.py b/server/dancedeets/event_scraper/auto_add.py index 9667763c..f57899ea 100644 --- a/server/dancedeets/event_scraper/auto_add.py +++ b/server/dancedeets/event_scraper/auto_add.py @@ -1,3 +1,14 @@ +""" +Auto-add event classification logic. + +The batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.auto_add_events + +This module retains: +- classify_events: Filter and classify potential events +- really_classify_events: Core classification and adding logic +- maybe_add_events: Add events by IDs (for non-batch contexts) +""" import datetime import logging import re @@ -9,50 +20,74 @@ from dancedeets.nlp import event_auto_classifier from dancedeets.nlp import event_classifier from dancedeets.nlp.styles import street -from dancedeets.util import fb_mapreduce -from dancedeets.util import mr from . import add_entities from . import potential_events def is_good_event_by_text(fb_event, classified_event): + """Check if event is a good dance event based on text classification.""" return event_auto_classifier.is_auto_add_event(classified_event).is_good_event() -def classify_events(fbl, pe_list, fb_list): +def classify_events(fbl, pe_list, fb_list, metrics=None): + """ + Filter and classify potential events. + + Args: + fbl: Facebook batch lookup + pe_list: List of PotentialEvent objects + fb_list: List of Facebook event data + metrics: Optional metrics counter (for Cloud Run Jobs) + + Returns: + List of result strings for added events + """ new_pe_list = [] new_fb_list = [] # Go through and find all potential events we actually want to attempt to classify for pe, fb_event in zip(pe_list, fb_list): # Get these past events out of the way, saved, then continue. - # Next time through this mapreduce, we shouldn't need to process them. if pe.set_past_event(fb_event): pe.put() if not fb_event or fb_event['empty']: - mr.increment('skip-due-to-empty') + if metrics: + metrics.increment('skip-due-to-empty') continue # Don't process events we've already looked at, or don't need to look at. - # This doesn't happen with the mapreduce that pre-filters them out, - # but it does happen when we scrape users potential events and throw them all in here. 
if pe.looked_at: logging.info('Already looked at event (added, or manually discarded), so no need to re-process.') - mr.increment('skip-due-to-looked-at') + if metrics: + metrics.increment('skip-due-to-looked-at') continue event_id = pe.fb_event_id if not re.match(r'^\d+$', event_id): logging.error('Found a very strange potential event id: %s', event_id) - mr.increment('skip-due-to-bad-id') + if metrics: + metrics.increment('skip-due-to-bad-id') continue new_pe_list.append(pe) new_fb_list.append(fb_event) - return really_classify_events(fbl, new_pe_list, new_fb_list) + return really_classify_events(fbl, new_pe_list, new_fb_list, metrics=metrics) + + +def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True, metrics=None): + """ + Core classification logic - classify and add dance events. + Args: + fbl: Facebook batch lookup + new_pe_list: List of PotentialEvent objects + new_fb_list: List of Facebook event data + allow_posting: Whether to post to social media + metrics: Optional metrics counter (for Cloud Run Jobs) -def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): + Returns: + List of result strings for added events + """ if not new_pe_list: new_pe_list = [None] * len(new_fb_list) logging.info('Filtering out already-added events and others, have %s remaining events to run the classifier on', len(new_fb_list)) @@ -97,20 +132,19 @@ def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): pe2.looked_at = True pe2.auto_looked_at = True pe2.put() - # TODO(lambert): handle un-add-able events differently results.append(result) - mr.increment('auto-added-dance-events') - if e.start_time < datetime.datetime.now(): - mr.increment('auto-added-dance-events-past') - # mr.increment('auto-added-dance-events-past-eventid-%s' % event_id) + if metrics: + metrics.increment('auto-added-dance-events') + if e.start_time < datetime.datetime.now(): + metrics.increment('auto-added-dance-events-past') + for vertical in e.verticals: + metrics.increment('auto-added-dance-event-past-vertical-%s' % vertical) + else: + metrics.increment('auto-added-dance-events-future') + for vertical in e.verticals: + metrics.increment('auto-added-dance-event-future-vertical-%s' % vertical) for vertical in e.verticals: - mr.increment('auto-added-dance-event-past-vertical-%s' % vertical) - else: - mr.increment('auto-added-dance-events-future') - for vertical in e.verticals: - mr.increment('auto-added-dance-event-future-vertical-%s' % vertical) - for vertical in e.verticals: - mr.increment('auto-added-dance-event-vertical-%s' % vertical) + metrics.increment('auto-added-dance-event-vertical-%s' % vertical) except fb_api.NoFetchedDataException as e: logging.error("Error adding event %s, no fetched data: %s", event_id, e) except add_entities.AddEventException as e: @@ -118,39 +152,19 @@ def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): return results -def classify_events_with_yield(fbl, pe_list): - fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True) - results = classify_events(fbl, pe_list, fb_list) - yield ''.join(results).encode('utf-8') - - -map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield) - +def maybe_add_events(fbl, event_ids): + """ + Attempt to add events by their IDs. 
-def mr_classify_potential_events(fbl, past_event, dancey_only): - filters = [] - if dancey_only: - filters.append(('should_look_at', '=', True)) - if past_event is not None: - filters.append(('past_event', '=', past_event)) - fb_mapreduce.start_map( - fbl, - 'Auto-Add Events', - 'dancedeets.event_scraper.auto_add.map_classify_events', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - filters=filters, - # Make sure we don't process so many that we cause the tasks to time out - handle_batch_size=10, - queue='fast-queue', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) + Used for non-batch contexts where we have specific event IDs to check. + Args: + fbl: Facebook batch lookup + event_ids: List of Facebook event IDs -def maybe_add_events(fbl, event_ids): + Returns: + List of result strings for added events + """ fb_events = fbl.get_multi(fb_api.LookupEvent, event_ids) empty_ids = [eid for x, eid in zip(fb_events, event_ids) if x['empty']] logging.info('Found empty ids: %s', empty_ids) diff --git a/server/dancedeets/event_scraper/thing_db.py b/server/dancedeets/event_scraper/thing_db.py index 58d8afc4..5e3b03ae 100644 --- a/server/dancedeets/event_scraper/thing_db.py +++ b/server/dancedeets/event_scraper/thing_db.py @@ -1,17 +1,24 @@ +""" +Source entity management for Facebook pages/groups/profiles. + +The batch source statistics computation has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.update_source_stats + +This module retains: +- Source model: Datastore entity for FB sources +- create_source_from_id: Create/update source from FB ID +- create_sources_from_event: Extract sources from event admins/owners +- Helper functions for FB source type detection +""" import datetime -import json import logging from google.appengine.ext import db from dancedeets.compat.mapreduce import json_util -from dancedeets.compat.mapreduce import mapreduce_pipeline -from dancedeets.compat.mapreduce import operation -from dancedeets.events import eventdata from dancedeets import fb_api from dancedeets.loc import gmaps_api from dancedeets.logic import backgrounder -from dancedeets.util import fb_mapreduce GRAPH_TYPE_PROFILE = 'GRAPH_TYPE_PROFILE' GRAPH_TYPE_FANPAGE = 'GRAPH_TYPE_FANPAGE' @@ -25,9 +32,7 @@ GRAPH_TYPE_GROUP, ] -# Start small -# Only set of sources with walls, and only hand-curated sources (or events). not grabbing new peoples yet. - +# Field types for source scraping FIELD_FEED = 'FIELD_FEED' # /feed FIELD_EVENTS = 'FIELD_EVENTS' # /events FIELD_INVITES = 'FIELD_INVITES' # fql query on invites for signed-up users @@ -35,6 +40,7 @@ class Source(db.Model): + """Represents a Facebook source (page, group, profile) for event discovery.""" graph_id = property(lambda x: str(x.key().name())) graph_type = db.StringProperty(choices=GRAPH_TYPES) @@ -50,8 +56,7 @@ class Source(db.Model): verticals = db.ListProperty(str, indexed=True) - # probably to assume for a given event? rough weighting factor? - # do we want to delete these now? 
+ # Style weighting factors (legacy) freestyle = db.FloatProperty(indexed=False) choreo = db.FloatProperty(indexed=False) @@ -60,6 +65,7 @@ class Source(db.Model): creation_time = db.DateTimeProperty(indexed=False, auto_now_add=True) last_scrape_time = db.DateTimeProperty(indexed=False) + # Statistics (updated by Cloud Run Job: update_source_stats) num_all_events = db.IntegerProperty(indexed=False) num_potential_events = db.IntegerProperty(indexed=False) num_real_events = db.IntegerProperty(indexed=False) @@ -77,7 +83,6 @@ def fraction_potential_are_real(self, bias=1): def fraction_real_are_false_negative(self, bias=1): if self.num_real_events: - #TODO(lambert): figure out why num_false_negatives is None, in particular for source id=107687589275667 even after saving num_false_negatives = (self.num_false_negatives or 0) + bias num_real_events = (self.num_real_events or 0) + bias return 1.0 * num_false_negatives / num_real_events @@ -88,22 +93,19 @@ def compute_derived_properties(self, fb_source_common, fb_source_data): if fb_source_common['empty']: # only update these when we have feed data self.fb_info = {} else: - self.fb_info = fb_source_data['info'] # LookupThing* (and all fb_info dependencies). Only used for /search_pages functionality + self.fb_info = fb_source_data['info'] self.graph_type = _type_for_fb_source(fb_source_common) if 'name' not in fb_source_common['info']: logging.error('cannot find name for fb event data: %s, cannot update source data...', fb_source_common) return self.name = fb_source_common['info']['name'] self.emails = fb_source_data['info'].get('emails', []) - if not self.emails: - pass # TODO: trigger basic crawl of website to search for emails feed = fb_source_common['feed']['data'] if len(feed): dt = datetime.datetime.strptime(feed[-1]['created_time'], '%Y-%m-%dT%H:%M:%S+0000') td = datetime.datetime.now() - dt total_seconds = td.seconds + td.days * 24 * 3600 self.feed_history_in_seconds = total_seconds - #logging.info('feed time delta is %s', self.feed_history_in_seconds) else: self.feed_history_in_seconds = 0 location = fb_source_data['info'].get('location') @@ -118,10 +120,10 @@ def compute_derived_properties(self, fb_source_common, fb_source_data): geocode = gmaps_api.lookup_address(address) if geocode: self.latitude, self.longitude = geocode.latlng() - #TODO(lambert): at some point we need to calculate all potential events, and all real events, and update the numbers with values from them. and all fake events. we have a problem where a new source gets added, adds in the potential events and/or real events, but doesn't properly tally them all. can fix this one-off, but it's too-late now, and i imagine our data will grow inaccurate over time anyway. 
def link_for_fb_source(data): + """Generate Facebook URL for a source.""" if 'link' in data['info']: return data['info']['link'] elif 'version' in data['info']: @@ -133,6 +135,7 @@ def link_for_fb_source(data): def _type_for_fb_source(fb_source_common): + """Determine graph type from FB metadata.""" source_type = fb_source_common['metadata']['metadata']['type'] if source_type == 'page': return GRAPH_TYPE_FANPAGE @@ -148,6 +151,7 @@ def _type_for_fb_source(fb_source_common): def get_lookup_for_graph_type(graph_type): + """Get the appropriate FB API lookup type for a graph type.""" if graph_type == GRAPH_TYPE_FANPAGE: return fb_api.LookupThingPage elif graph_type == GRAPH_TYPE_GROUP: @@ -160,18 +164,19 @@ def get_lookup_for_graph_type(graph_type): def create_source_from_id(fbl, source_id, verticals=None): + """Create or update a Source from a Facebook ID.""" source = create_source_from_id_without_saving(fbl, source_id, verticals=verticals) if source: new_source = (not source.creation_time) source.put() if new_source: - # It seems some "new" sources are existing sources without a creation_time set, so let's force-set it here source.creation_time = datetime.datetime.now() backgrounder.load_sources([source_id], fb_uid=fbl.fb_uid) return source def create_source_from_id_without_saving(fbl, source_id, verticals=None): + """Create a Source object without saving to Datastore.""" logging.info('create_source_from_id: %s', source_id) if not source_id: return None @@ -184,8 +189,6 @@ def create_source_from_id_without_saving(fbl, source_id, verticals=None): original_allow_cache = fbl.allow_cache fbl.allow_cache = True try: - - # technically we could check if the object exists in the db, before we bother fetching the feed fb_source_common = fbl.get(fb_api.LookupThingCommon, source_id) if fb_source_common['empty']: logging.error('Error loading Common Fields for Source: %s', source_id) @@ -210,106 +213,9 @@ def create_source_from_id_without_saving(fbl, source_id, verticals=None): def create_sources_from_event(fbl, db_event): + """Create Source entities from an event's owner and admins.""" logging.info('create_sources_from_event: %s', db_event.id) create_source_from_id(fbl, db_event.owner_fb_uid, verticals=db_event.verticals) for admin in db_event.admins: if admin['id'] != db_event.owner_fb_uid: create_source_from_id(fbl, admin['id'], verticals=db_event.verticals) - - -map_create_sources_from_event = fb_mapreduce.mr_wrap(create_sources_from_event) - - -def explode_per_source_count(pe): - db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id) - - is_potential_event = pe.match_score > 0 - real_event = db_event != None - false_negative = bool(db_event and not is_potential_event) - result = (is_potential_event, real_event, false_negative) - - for source_id in pe.source_ids_only(): - yield (source_id, json.dumps(result)) - - -def combine_source_count(source_id, counts_to_sum): - s = Source.get_by_key_name(source_id) - if not s: - return - - s.num_all_events = 0 - s.num_potential_events = 0 - s.num_real_events = 0 - s.num_false_negatives = 0 - - for result in counts_to_sum: - (potential_event, real_event, false_negative) = json.loads(result) - s.num_all_events += 1 - if potential_event: - s.num_potential_events += 1 - if real_event: - s.num_real_events += 1 - if false_negative: - s.num_false_negatives += 1 - yield operation.db.Put(s) - - -def mr_count_potential_events(fbl, queue): - mapper_params = { - 'entity_kind': 'dancedeets.event_scraper.potential_events.PotentialEvent', - } - 
mapper_params.update(fb_mapreduce.get_fblookup_params(fbl)) - pipeline = mapreduce_pipeline.MapreducePipeline( - 'clean source counts', - 'dancedeets.event_scraper.thing_db.explode_per_source_count', - 'dancedeets.event_scraper.thing_db.combine_source_count', - 'mapreduce.input_readers.DatastoreInputReader', - None, - mapper_params=mapper_params, - ) - pipeline.start(queue_name=queue) - - -""" -user: -- invited-events fql (event, if member) -- friends (user, if member) -- events (event) -- wall (event, user, page, group) -- likes (page) -- groups (group) - -fanpage: -- wall (event, user, page, group) -- likes (page) -- events (event) -- groups (group) - -event: -- wall (event, user, page, group) -- attending (user) -- creator (user) - -group: -- wall (event, user, page, group) -- members (user) - -Known Dancer Entities (profiles, fan pages, events, groups) -- scrape them for events -- track in each entity, how many events were found on wall, events -- track total-time-of-wall so we know refresh frequency - -status: -dance-related, scrape, add everything in here to "maybe" list -maybe-dance-related, scrape but only return high-quality events, don't scrape for anything-but-events -not-dance-related, don't scrape -old (event), no longer scrape, happens after event has passed - -status set periodically in all-out-mapreduce -- old events stay old -- sources stay dance-related if manually set -- sources become dance-related if they find dance events via it -- sources become not-dance-related if there are no dance events on it after a month or two? or if number of dancer-friends is <20? - -- also want to track how many pages/groups were found via this entity -""" diff --git a/server/dancedeets/event_scraper/thing_scraper2.py b/server/dancedeets/event_scraper/thing_scraper2.py index 3197971f..91ec54e7 100644 --- a/server/dancedeets/event_scraper/thing_scraper2.py +++ b/server/dancedeets/event_scraper/thing_scraper2.py @@ -1,83 +1,28 @@ -import json -import logging +""" +Source scraping and event processing. + +This functionality has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.scrape_and_classify -from dancedeets.compat.mapreduce import mapreduce_pipeline -from dancedeets.util import fb_mapreduce +This module is kept for backwards compatibility but the mapreduce +handlers have been removed. Use the Cloud Run Job instead. +""" +import logging from dancedeets import app from dancedeets import base_servlet -from dancedeets.util import mr -from . import event_pipeline -from . import potential_events -from . import thing_scraper - - -def scrape_sources_for_events(sources): - fbl = fb_mapreduce.get_fblookup() - fbl.allow_cache = False - # Eliminate all caches (both fetching, and saving!) - # This should save on a bunch of unnecessary put() calls while scraping - # (Current estimates are 30qps * 60 sec/min * 50min * $0.18/10K Queries * 30 days = $48/month) - fbl.make_passthrough() - discovered_list = thing_scraper.discover_events_from_sources(fbl, sources) - for x in discovered_list: - state = (x.event_id, x.source_id, x.source_field, x.extra_source_id) - mr.increment('found-event-to-check') - # Don't "shard" events....just group them by id. 
-    # And let the functionality of them sharing sources happen naturally
-    yield (x.event_id, json.dumps(state))
-
-
-def process_events(event_id, via_sources):
-    fbl = fb_mapreduce.get_fblookup()
-    fbl.allow_cache = True
-    discovered_list = []
-    logging.info('Running process_events with %s event-sources', len(via_sources))
-    for data in via_sources:
-        event_id, source_id, source_field, extra_source_id = json.loads(data)
-        discovered = potential_events.DiscoveredEvent(event_id, None, source_field, extra_source_id)
-        discovered.source = None # TODO: This will come back to bite us I'm sure :(
-        discovered.source_id = source_id
-        discovered_list.append(discovered)
-    # Some of these are newly-discovered events, some of these are already-cached and classified.
-    # TODO: Filter out the already-classified ones, so we don't waste time re-classifying on cached on data.
-    event_pipeline.process_discovered_events(fbl, discovered_list)
 
 
 @app.route('/tasks/scrape_sources_and_process_events')
 class LoadPotentialEventsFromWallPostsHandler(base_servlet.BaseTaskFacebookRequestHandler):
-    def get(self):
-        min_potential_events = int(self.request.get('min_potential_events', '0'))
-        queue = self.request.get('queue', 'slow-queue')
-        mapreduce_scrape_sources_and_process_events(self.fbl, min_potential_events=min_potential_events, queue=queue)
-
+    """
+    Legacy handler - scraping has been migrated to Cloud Run Jobs.
 
-def mapreduce_scrape_sources_and_process_events(fbl, min_potential_events, queue):
-    mapper_params = {
-        'entity_kind': 'dancedeets.event_scraper.thing_db.Source',
-        'min_potential_events': min_potential_events,
-        'handle_batch_size': 20,
-    }
-    reducer_params = {
-        'output_writer': {
-            'bucket_name': 'dancedeets-hrd.appspot.com',
-            'content_type': 'text/plain',
-        }
-    }
-    fb_params = fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True)
-    mapper_params.update(fb_params)
-    reducer_params.update(fb_params)
-
-    # output = yield ...
-    pipeline = mapreduce_pipeline.MapreducePipeline(
-        'Scrape sources, then load and classify the events',
-        'dancedeets.event_scraper.thing_scraper2.scrape_sources_for_events',
-        'dancedeets.event_scraper.thing_scraper2.process_events',
-        'mapreduce.input_readers.DatastoreInputReader',
-        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
-        mapper_params=mapper_params,
-        reducer_params=reducer_params,
-        shards=16,
-    )
-
-    pipeline.start(queue_name=queue)
+    Use: python -m dancedeets.jobs.runner --job=scrape_and_classify
+    """
+    def get(self):
+        logging.warning(
+            'This endpoint is deprecated. '
+            'Use Cloud Run Job: dancedeets.jobs.scrape_and_classify instead.'
+ ) + self.response.out.write('DEPRECATED: Use Cloud Run Job scrape_and_classify instead') diff --git a/server/dancedeets/logic/mr_dump.py b/server/dancedeets/logic/mr_dump.py deleted file mode 100644 index 5c265a43..00000000 --- a/server/dancedeets/logic/mr_dump.py +++ /dev/null @@ -1,48 +0,0 @@ -import csv -import io -import json -import logging - -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce - - -def dump_fb_json(fbl, pe_list): - pe_list = [x for x in pe_list if x.match_score > 0] - if not pe_list: - return - - fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list]) - fbl.batch_fetch() - - csv_file = io.StringIO() - csv_writer = csv.writer(csv_file) - - for pe in pe_list: - try: - result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)) - cache_key = fbl.key_to_cache_key(fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id)) - csv_writer.writerow([cache_key, result]) - except fb_api.NoFetchedDataException: - logging.error("skipping row for event id %s", pe.fb_event_id) - yield csv_file.getvalue() - - -map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json) - - -def mr_dump_events(fbl): - fb_mapreduce.start_map( - fbl, - 'Dump Potential FB Event Data', - 'dancedeets.logic.mr_dump.map_dump_fb_json', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - handle_batch_size=80, - queue=None, - filters=[('looked_at', '=', None)], - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) diff --git a/server/dancedeets/logic/unique_attendees.py b/server/dancedeets/logic/unique_attendees.py deleted file mode 100644 index ca9e96bc..00000000 --- a/server/dancedeets/logic/unique_attendees.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging - -from dancedeets.compat.mapreduce import mapreduce_pipeline - -from dancedeets import app -from dancedeets import base_servlet -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce - -BATCH_SIZE = 20 - - -def map_each_attendee(db_events): - db_events = [x for x in db_events if x.is_fb_event] - - fbl = fb_mapreduce.get_fblookup() - fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events]) - fbl.batch_fetch() - - for db_event in db_events: - try: - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, db_event.id) - except fb_api.NoFetchedDataException: - logging.warning('No attending found for %s', db_event.id) - continue - if fb_event_attending['empty']: - continue - for attendee in fb_event_attending['attending']['data']: - yield ('City: %s' % db_event.city_name, attendee['id']) - yield ('Country: %s' % db_event.country, attendee['id']) - - -def reduce_just_unique_attendees(location, all_attendees): - yield 'Unique Attendees in %s: %s\n' % (location, len(set(all_attendees))) - yield 'Total RSVPs in %s: %s\n' % (location, len(all_attendees)) - - -def mr_count_attendees_per_city(fbl): - mapper_params = { - 'entity_kind': 'dancedeets.events.eventdata.DBEvent', - 'handle_batch_size': BATCH_SIZE, - } - mapper_params.update(fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True)) - mrp = mapreduce_pipeline.MapreducePipeline( - 'unique_attendees', - 'dancedeets.logic.unique_attendees.map_each_attendee', - 'dancedeets.logic.unique_attendees.reduce_just_unique_attendees', - 'mapreduce.input_readers.DatastoreInputReader', - 'mapreduce.output_writers.GoogleCloudStorageOutputWriter', - mapper_params=mapper_params, - 
reducer_params={ - 'output_writer': { - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - }, - shards=8, - ) - mrp.start() - return mrp - - -@app.route('/tools/unique_attendees') -class ExportSourcesHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - mr_count_attendees_per_city(self.fbl) diff --git a/server/dancedeets/ml/gprediction.py b/server/dancedeets/ml/gprediction.py index 1b89475b..74356842 100644 --- a/server/dancedeets/ml/gprediction.py +++ b/server/dancedeets/ml/gprediction.py @@ -1,105 +1,86 @@ -import csv -import io +""" +Google Prediction API integration for event classification. + +The batch processing jobs have been migrated to Cloud Run Jobs. +See: +- dancedeets.jobs.generate_training_data +- dancedeets.jobs.classify_events_ml + +This module retains: +- get_predict_service(): Google Prediction API client +- get_training_features(): Feature extraction for ML +- predict(): Single event prediction +""" import logging import string -from dancedeets.events import eventdata from dancedeets.events import event_locations -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce convert_chars = string.punctuation + '\r\n\t' trans = str.maketrans(convert_chars, ' ' * len(convert_chars)) def strip_punctuation(s): + """Remove punctuation from a string.""" return s.translate(trans) -def training_data_for_pevents(fbl, pevents): - fbl.allow_memcache_write = False # don't pollute memcache - fb_event_ids = [x.fb_event_id for x in pevents if x.looked_at] - fbl.request_multi(fb_api.LookupEvent, fb_event_ids) - fbl.request_multi(fb_api.LookupEventAttending, fb_event_ids) - fbl.batch_fetch() - - good_event_ids = [x.fb_event_id for x in eventdata.DBEvent.get_by_ids(fb_event_ids, keys_only=True) if x] - - csv_file = io.StringIO() - csv_writer = csv.writer(csv_file) - - for potential_event in pevents: - if not potential_event.looked_at: - continue - try: - good_event = potential_event.fb_event_id in good_event_ids and 'dance' or 'nodance' - - fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id) - if fb_event['empty']: - continue - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, potential_event.fb_event_id) - - training_features = get_training_features(potential_event, fb_event, fb_event_attending) - csv_writer.writerow([good_event] + list(training_features)) - except fb_api.NoFetchedDataException: - logging.info("No data fetched for event id %s", potential_event.fb_event_id) - yield csv_file.getvalue() - - -map_training_data_for_pevents = fb_mapreduce.mr_wrap(training_data_for_pevents) +def get_training_features(potential_event, fb_event, fb_event_attending): + """ + Extract training features from an event. 
+ Args: + potential_event: PotentialEvent instance + fb_event: Facebook event data + fb_event_attending: Facebook event attending data -def get_training_features(potential_event, fb_event, fb_event_attending): + Returns: + Tuple of feature values + """ if 'owner' in fb_event['info']: owner_name = 'id%s' % fb_event['info']['owner']['id'] else: owner_name = '' - location = event_locations.get_address_for_fb_event(fb_event).encode('utf-8') + location = event_locations.get_address_for_fb_event(fb_event) + if isinstance(location, str): + location = location.encode('utf-8') def strip_text(s): - return strip_punctuation(s.encode('utf8')).lower() + if isinstance(s, str): + s = s.encode('utf8') + return strip_punctuation(s.decode('utf8') if isinstance(s, bytes) else s).lower() name = strip_text(fb_event['info'].get('name', '')) description = strip_text(fb_event['info'].get('description', '')) - attendee_list = ' '.join(['id%s' % x['id'] for x in fb_event_attending['attending']['data']]) + attending_data = fb_event_attending.get('attending', {}).get('data', []) + attendee_list = ' '.join(['id%s' % x['id'] for x in attending_data]) source_list = ' '.join('id%s' % x.id for x in potential_event.source_ids_only()) - #TODO(lambert): maybe include number-of-keywords and keyword-density? - - #TODO(lambert): someday write this as a proper mapreduce that reduces across languages and builds a classifier model per language? - # for now we can just grep and build sub-models per-language on my client machine. + # Currently only returning attendee_list (other features commented out in original) return (attendee_list,) - return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) - - -def mr_generate_training_data(fbl): - fb_mapreduce.start_map( - fbl=fbl, - name='Write Training Data', - handler_spec='dancedeets.ml.gprediction.map_training_data_for_pevents', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - handle_batch_size=20, - entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - queue=None, - ) + # Full features would be: + # return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) MAGIC_USER_ID = '100529355548393795594' def get_predict_service(): - #TODO(lambert): we need to cache this somehow, if we use this, since it appears to not even use memcache for credentials. + """ + Get the Google Prediction API service client. + + Note: This uses OAuth credentials stored in Datastore. + """ + # TODO(lambert): we need to cache this somehow import httplib2 from apiclient.discovery import build from oauth2client import appengine - credentials = appengine.StorageByKeyName(appengine.CredentialsModel, MAGIC_USER_ID, 'credentials').get() + credentials = appengine.StorageByKeyName( + appengine.CredentialsModel, MAGIC_USER_ID, 'credentials' + ).get() http = credentials.authorize(httplib2.Http()) service = build("prediction", "v1.5", http=http) @@ -112,15 +93,46 @@ def get_predict_service(): def predict(potential_event, fb_event, fb_event_attending, service=None): - body = {'input': {'csvInstance': get_training_features(potential_event, fb_event, fb_event_attending)}} + """ + Predict whether an event is a dance event. 
+ + Args: + potential_event: PotentialEvent instance + fb_event: Facebook event data + fb_event_attending: Facebook event attending data + service: Optional prediction service (will be created if not provided) + + Returns: + Tuple of (dance_bias_score, not_dance_bias_score) + """ + body = { + 'input': { + 'csvInstance': get_training_features( + potential_event, fb_event, fb_event_attending + ) + } + } logging.info("Dance Data: %r", body) + service = service or get_predict_service() train = service.trainedmodels() + dance_bias_prediction = train.predict(body=body, id=DANCE_BIAS_MODEL_NAME).execute() - dance_bias_score = [x['score'] for x in dance_bias_prediction['outputMulti'] if x['label'] == 'dance'][0] - not_dance_bias_prediction = train.predict(body=body, id=NOT_DANCE_BIAS_MODEL_NAME).execute() - not_dance_bias_score = [x['score'] for x in not_dance_bias_prediction['outputMulti'] if x['label'] == 'dance'][0] + dance_bias_score = [ + x['score'] for x in dance_bias_prediction['outputMulti'] + if x['label'] == 'dance' + ][0] + + not_dance_bias_prediction = train.predict( + body=body, id=NOT_DANCE_BIAS_MODEL_NAME + ).execute() + not_dance_bias_score = [ + x['score'] for x in not_dance_bias_prediction['outputMulti'] + if x['label'] == 'dance' + ][0] + logging.info("Dance Result: %s", dance_bias_prediction) logging.info("NoDance Result: %s", not_dance_bias_prediction) logging.info("Dance Score: %s, NoDance Score: %s", dance_bias_score, not_dance_bias_score) + return dance_bias_score, not_dance_bias_score diff --git a/server/dancedeets/ml/mr_prediction.py b/server/dancedeets/ml/mr_prediction.py deleted file mode 100644 index 6cb85cec..00000000 --- a/server/dancedeets/ml/mr_prediction.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from dancedeets import fb_api - -from dancedeets.event_scraper import potential_events -from . 
import gprediction -from dancedeets.util import fb_mapreduce - - -def classify_events(fbl, pe_list): - pe_list = [x for x in pe_list if x.match_score > 0] - if not pe_list: - return - predict_service = None - pe_ids = [x.fb_event_id for x in pe_list if not getattr(x, 'dance_bias_score')] - fbl.request_multi(fb_api.LookupEvent, pe_ids) - fbl.request_multi(fb_api.LookupEventAttending, pe_ids) - fbl.batch_fetch() - - results = [] - for pe in pe_list: - if not getattr(pe, 'dance_bias_score'): - try: - fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id) - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id) - except fb_api.NoFetchedDataException: - continue - if fb_event['empty']: - continue - predict_service = predict_service or gprediction.get_predict_service() - pe = potential_events.update_scores_for_potential_event(pe, fb_event, fb_event_attending, predict_service) - logging.info("%s has ms=%s, d=%s, nd=%s", pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score) - if pe.dance_bias_score > 0.5 and pe.non_dance_bias_score > 0.5: - result = '%s:%s:%s:%s\n' % (pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score) - results.append(result) - yield ''.join(results).encode('utf-8') - - -map_classify_events = fb_mapreduce.mr_wrap(classify_events) - - -def mr_classify_potential_events(fbl): - fb_mapreduce.start_map( - fbl, - 'Auto-Classify Events', - 'dancedeets.ml.mr_prediction.map_classify_events', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - filters=[('looked_at', '=', None)], - handle_batch_size=20, - queue='slow-queue', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) diff --git a/server/dancedeets/notifications/added_events.py b/server/dancedeets/notifications/added_events.py index 03c1351e..92fe23fe 100644 --- a/server/dancedeets/notifications/added_events.py +++ b/server/dancedeets/notifications/added_events.py @@ -1,9 +1,17 @@ +""" +User event notifications. + +The main batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.notify_users + +This module retains: +- promote_events_to_user(): Core notification logic (used by jobs and dev handler) +- /tasks/promote_new_events_to_user: Dev handler for testing single user +""" import datetime import logging import time -from dancedeets.compat.mapreduce import control - from dancedeets import app from dancedeets import base_servlet from dancedeets.loc import gmaps_api @@ -12,13 +20,10 @@ from dancedeets.search import search_base from dancedeets.users import users from . 
import android -""" -Runs a mapreduce hourly, which finds all users with that timezone offset, -and sends notifications about recently-aevents to those users -""" def get_time_offset(): + """Calculate timezone offset to target for 4pm local notifications.""" desired_hour = 16 # send new-event notifications at 4pm current_hour = datetime.datetime.now().hour # should be UTC hour offset = desired_hour - current_hour @@ -29,32 +34,7 @@ def get_time_offset(): return float(offset) -@app.route('/tasks/promote_new_events') -class RemindUserMapReduceHandler(base_servlet.BaseTaskRequestHandler): - def get(self): - if self.request.get('offset'): - offset = float(self.request.get('offset')) - else: - offset = get_time_offset() - string_offset = '%+03d00' % offset - logging.info("Got time offset %s for our run", string_offset) - # offset needs to be of type float, or this doesn't work - control.start_map( - name='Send New Events to Users in TZ%s' % string_offset, - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.notifications.added_events.promote_events_to_user', - mapper_parameters={ - 'entity_kind': 'dancedeets.users.users.User', - 'filters': [ - ('timezone_offset', '>=', offset), - ('timezone_offset', '<', offset + 1), - ], - }, - shard_count=1, - ) - - -# for development only, usually this will be called via mapreduce +# For development/testing only @app.route('/tasks/promote_new_events_to_user') class RemindUserHandler(base_servlet.BaseTaskRequestHandler): def get(self): @@ -64,12 +44,18 @@ def get(self): def promote_events_to_user(user): + """ + Send push notifications about new events to a user. + + This is the core notification logic used by both: + - Cloud Run Job: dancedeets.jobs.notify_users + - Dev handler: /tasks/promote_new_events_to_user + """ # TODO: Adjust when we have iphone notifications if not android.can_notify(user): return logging.info("Promoting new events to user %s", user.fb_uid) - # Only send notifications for Mike for now user = users.User.get_by_id(user.fb_uid) if not user: logging.error("No user found: %s", user.fb_uid) @@ -89,17 +75,29 @@ def promote_events_to_user(user): if not geocode: return None bounds = math.expand_bounds(geocode.latlng_bounds(), distance_in_km) - query = search_base.SearchQuery(time_period=search_base.TIME_UPCOMING, bounds=bounds, min_attendees=min_attendees) + query = search_base.SearchQuery( + time_period=search_base.TIME_UPCOMING, + bounds=bounds, + min_attendees=min_attendees + ) - one_day_ago = time.mktime((datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple()) + one_day_ago = time.mktime( + (datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple() + ) search_query = search.Search(query) search_query.extra_fields = ['creation_time'] search_results = search_query._get_candidate_doc_events() # TODO: can we move this filter into the search query itself?? 
- recent_events = [x.doc_id for x in search_results if x.field('creation_time').value > one_day_ago] - - logging.info("Found %s search_results, %s new events", len(search_results), len(recent_events)) + recent_events = [ + x.doc_id for x in search_results + if x.field('creation_time').value > one_day_ago + ] + + logging.info( + "Found %s search_results, %s new events", + len(search_results), len(recent_events) + ) for event_id in recent_events: if android.add_notify(user, event_id): logging.info("Sent notification!") diff --git a/server/dancedeets/pubsub/pubsub_tasks.py b/server/dancedeets/pubsub/pubsub_tasks.py index f922c401..412bf1c4 100644 --- a/server/dancedeets/pubsub/pubsub_tasks.py +++ b/server/dancedeets/pubsub/pubsub_tasks.py @@ -1,5 +1,14 @@ -from dancedeets.compat.mapreduce import control +""" +Social publishing task handlers. +The batch posting of Japan events has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.post_japan_events + +This module retains: +- SocialPublisherHandler: Pulls and publishes events from pubsub queue +- WeeklyEventsPostHandler: Posts weekly events for top US cities +- EventNotificationsHandler: Prepares event reminder notifications +""" import datetime from dancedeets import app @@ -8,7 +17,6 @@ from dancedeets.rankings import cities_db from dancedeets.search import search_base from dancedeets.search import search -from dancedeets.util import dates from . import pubsub @@ -18,36 +26,6 @@ def get(self): pubsub.pull_and_publish_event() -def yield_post_jp_event(db_events): - from dancedeets.compat.mapreduce import context - ctx = context.get() - params = ctx.mapreduce_spec.mapper.params - token_nickname = params.get('token_nickname') - db_events = [x for x in db_events if x.actual_city_name and x.actual_city_name.endswith('Japan')] - for db_event in db_events: - pubsub.eventually_publish_event(db_event.id, token_nickname) - - -@app.route('/tasks/post_japan_events') -class PostJapanEventsHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - token_nickname = self.request.get('token_nickname', None) - mapper_params = { - 'entity_kind': 'dancedeets.events.eventdata.DBEvent', - 'handle_batch_size': 20, - 'filters': [('search_time_period', '=', dates.TIME_FUTURE)], - 'token_nickname': token_nickname, - } - control.start_map( - name='Post Future Japan Events', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.pubsub.pubsub_tasks.map_post_jp_event', - shard_count=8, # since we want to stick it in the slow-queue, and don't care how fast it executes - queue_name='fast-queue', - mapper_parameters=mapper_params, - ) - - def blacklisted(city): if city.country_name == 'US' and city.state_name == 'NY' and city.city_name in [ 'Brooklyn', 'Borough of Queens', 'Manhattan', 'The Bronx' diff --git a/server/dancedeets/rankings/rankings.py b/server/dancedeets/rankings/rankings.py index 03361797..a4465cd1 100644 --- a/server/dancedeets/rankings/rankings.py +++ b/server/dancedeets/rankings/rankings.py @@ -1,20 +1,20 @@ -import datetime - -from dancedeets.util import memcache +""" +City/country rankings utilities. -# Note: MapReduce is no longer available in App Engine Flexible. -# These imports are kept for reference but the functions won't work. -# Use Cloud Dataflow for batch processing. +The batch ranking computation has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.compute_rankings -from dancedeets.loc import gmaps_api -from . 
import cities_db +This module retains: +- TIME_PERIODS and constants for display +- retrieve_summary(): Get cached ranking totals +- compute_city_template_rankings(): Format rankings for templates +""" +import datetime -EVENT_FOR_CITY_RANKING = "CITY_EVENT_RANKING" -USER_FOR_CITY_RANKING = "CITY_USER_RANKING" +from dancedeets.util import memcache -# location is a city in cities/state/country -# time_period is one of ALL_TIME, LAST_MONTH, LAST_WEEK +# Time period constants LAST_WEEK = "LAST_WEEK" LAST_MONTH = "LAST_MONTH" ALL_TIME = "ALL_TIME" @@ -33,6 +33,7 @@ def get_time_periods(timestamp): + """Get applicable time periods for a given timestamp.""" if timestamp > datetime.datetime.now() - datetime.timedelta(days=7): yield LAST_WEEK if timestamp > datetime.datetime.now() - datetime.timedelta(days=31): @@ -40,168 +41,29 @@ def get_time_periods(timestamp): yield ALL_TIME -def make_key_name(key_name, **kwargs): - return "%s/%s" % ( - key_name, - "/".join("%s=%s" % (k, v) for (k, v) in sorted(kwargs.items())), - ) - - -def count_event_for_city(dbevent): - if not dbevent.start_time: # deleted event, don't count - return - if not dbevent.latitude or not dbevent.longitude: # no-location event, don't count - return - city = dbevent.city_name - for time_period in get_time_periods(dbevent.creation_time or dbevent.start_time): - yield op.counters.Increment( - make_key_name("City", city=city, time_period=time_period) - ) - yield op.counters.Increment( - make_key_name("Country", country=dbevent.country, time_period=time_period) - ) - - -def count_user_for_city(user): - user_city = user.city_name - for time_period in get_time_periods(user.creation_time): - yield op.counters.Increment( - make_key_name("City", city=user_city, time_period=time_period) - ) - - -def begin_event_ranking_calculations(vertical): - filters = [("verticals", "=", vertical)] - - control.start_map( - name="Compute City Rankings by %s Events" % vertical, - reader_spec="mapreduce.input_readers.DatastoreInputReader", - handler_spec="dancedeets.rankings.rankings.count_event_for_city", - mapper_parameters={ - "entity_kind": "dancedeets.events.eventdata.DBEvent", - "filters": filters, - }, - queue_name="fast-queue", - shard_count=16, - _app=_get_app_id(EVENT_FOR_CITY_RANKING, vertical), - ) - _compute_summary(expiry=5 * 60) # 5 minutes - - -def begin_user_ranking_calculations(): - control.start_map( - name="Compute City Rankings by Users", - reader_spec="mapreduce.input_readers.DatastoreInputReader", - handler_spec="dancedeets.rankings.rankings.count_user_for_city", - mapper_parameters={"entity_kind": "dancedeets.users.users.User"}, - queue_name="fast-queue", - shard_count=16, - _app=USER_FOR_CITY_RANKING, - ) - _compute_summary(expiry=5 * 60) # 5 minutes - - TOTALS_KEY = "StatTotals" TOTALS_EXPIRY = 6 * 3600 def retrieve_summary(): + """ + Retrieve cached ranking summary. + + Returns cached totals or empty dict if not available. 
+ Rankings are computed by the Cloud Run Job: dancedeets.jobs.compute_rankings + """ totals = memcache.get(TOTALS_KEY) if not totals: - totals = _compute_summary() - return totals - - -def _get_app_id(app_name, vertical): - return "%s:%s" % (app_name, vertical) - - -def _compute_summary(expiry=TOTALS_EXPIRY): - # TODO: make this handle non-street events better - vertical = "STREET" - - # IN PROGRESS - event_rankings = get_city_by_event_rankings(vertical) - if event_rankings: - total_events = _compute_sum(event_rankings, ALL_TIME) - else: - total_events = 0 - user_rankings = get_city_by_user_rankings() - if user_rankings: - total_users = _compute_sum(user_rankings, ALL_TIME) - else: - total_users = 0 - - # save - totals = dict(total_events=total_events, total_users=total_users) - memcache.set(TOTALS_KEY, totals, expiry) - + # Rankings not yet computed - return empty totals + totals = dict(total_events=0, total_users=0) return totals -def _parse_key_name(full_key_name): - if "/" not in full_key_name: - return None, {} - key_name, kwargs_string = full_key_name.split("/", 1) - try: - kwargs = dict(kv.split("=") for kv in kwargs_string.split("/")) - except ValueError: - return None, {} - return key_name, kwargs - - -def _get_counter_map_for_ranking(ranking): - # MapReduce is not available in App Engine Flexible Environment. - # This function would have queried mapreduce.model.MapreduceState, - # but that API is not available. Return None to indicate no rankings data. - # TODO: Implement using Cloud Dataflow or BigQuery for batch processing. - return None - - -def _group_cities_time_period(final_counter_map): - cities = {} - for k, counter in final_counter_map.items(): - prefix, kwargs = _parse_key_name(k) - if prefix != "City": - continue - cities.setdefault(kwargs["city"], {})[kwargs["time_period"]] = counter - return cities - - -def _group_users_time_period(final_counter_map, city): - users = {} - for k, counter in final_counter_map.items(): - prefix, kwargs = _parse_key_name(k) - if prefix != "User": - continue - if city and kwargs["city"] != city: - continue - users.setdefault(kwargs["user"], {})[kwargs["time_period"]] = counter - return users - - -def get_city_by_event_rankings(vertical): - final_counter_map = _get_counter_map_for_ranking( - _get_app_id(EVENT_FOR_CITY_RANKING, vertical) - ) - if not final_counter_map: - return {} - cities = _group_cities_time_period(final_counter_map) - return cities - - -def get_city_by_user_rankings(): - final_counter_map = _get_counter_map_for_ranking(USER_FOR_CITY_RANKING) - if not final_counter_map: - return {} - cities = _group_cities_time_period(final_counter_map) - return cities - - def _compute_sum(all_rankings, time_period): + """Compute total count across all cities for a time period.""" total_count = 0 for city, times in all_rankings.items(): - count = times.get(time_period, {}) + count = times.get(time_period, 0) total_count += count return total_count @@ -209,11 +71,23 @@ def _compute_sum(all_rankings, time_period): def compute_city_template_rankings( all_rankings, time_period, vertical=None, use_url=True ): + """ + Format city rankings for template display. 
+ + Args: + all_rankings: Dict of city -> time_period -> count + time_period: Which time period to display + vertical: Event vertical for admin URLs + use_url: Whether to include URLs in output + + Returns: + List of dicts with city, count, and url + """ city_ranking = [] for city, times in all_rankings.items(): if city == "Unknown": continue - count = times.get(time_period, {}) + count = times.get(time_period, 0) if count: if use_url == "ADMIN": url = "/tools/recent_events?vertical=%s&city=%s" % (vertical, city) diff --git a/server/dancedeets/search/email_events.py b/server/dancedeets/search/email_events.py index 7381ec0d..a7dec60d 100644 --- a/server/dancedeets/search/email_events.py +++ b/server/dancedeets/search/email_events.py @@ -1,3 +1,14 @@ +""" +Weekly email functionality. + +The batch sending has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.send_weekly_emails + +This module retains: +- email_for_user: Core function to generate and send email for a user +- yield_email_user: Wrapper that handles FB token and error handling +- DisplayEmailHandler: Admin tool to preview weekly emails +""" import datetime import logging import random @@ -13,7 +24,6 @@ from dancedeets.logic import mobile from dancedeets.mail import mandrill_api from dancedeets.users import users -from dancedeets.util import fb_mapreduce from . import search_base from . import search @@ -178,17 +188,3 @@ def yield_email_user(fbl, user): except Exception as e: logging.exception("Error sending email for user %s", user.fb_uid) return None - - -map_email_user = fb_mapreduce.mr_user_wrap(yield_email_user) -email_user = fb_mapreduce.nomr_wrap(yield_email_user) - - -def mr_email_user(fbl): - fb_mapreduce.start_map( - fbl=fbl, - name='Email Users', - #TODO: MOVE - handler_spec='dancedeets.search.email_events.map_email_user', - entity_kind='dancedeets.users.users.User', - ) diff --git a/server/dancedeets/sitemaps/events.py b/server/dancedeets/sitemaps/events.py index c0eb0e31..2bfe2921 100644 --- a/server/dancedeets/sitemaps/events.py +++ b/server/dancedeets/sitemaps/events.py @@ -1,108 +1,76 @@ +""" +Sitemap generation utilities. + +The main batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.generate_sitemaps + +This module retains the sitemap entry generation helper for use by the job. 
+""" import datetime from lxml import etree import logging -# local -from dancedeets import app -from dancedeets import base_servlet -from dancedeets.util import fb_mapreduce from dancedeets.util import urls -def yield_sitemap_event(fbl, all_events): - # Don't really need fbl, but makes everything easier - - for event in all_events: - if not event.has_content(): - continue - - url_node = etree.Element('url') - loc_node = etree.Element('loc') - loc_node.text = urls.dd_event_url(event) - if event.is_fb_event: - if 'updated_time' in event.fb_event['info']: - lastmod_node = etree.Element('lastmod') - updated = event.fb_event['info']['updated_time'] - updated = updated.replace('+0000', '+00:00') - lastmod_node.text = updated - url_node.append(lastmod_node) - else: - logging.info('Event %s does not have updated_time: %s' % (event.id, event.fb_event)) - changefreq_node = etree.Element('changefreq') - priority_node = etree.Element('priority') - - if event.end_time: - end_time = event.end_time - else: - end_time = event.start_time + datetime.timedelta(hours=2) - - start_time_delta = event.start_time - datetime.datetime.now() - end_time_delta = end_time - datetime.datetime.now() - event_delta = end_time - event.start_time - - priority_node.text = '0.5' - - # Event is active and not a multi-week event: - if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: - changefreq_node.text = 'hourly' - - # If it ended awhile ago - elif end_time_delta.days < -30: - changefreq_node.text = 'yearly' - priority_node.text = '0.1' - elif end_time_delta.days < -10: - changefreq_node.text = 'weekly' +def generate_sitemap_entry(event): + """ + Generate a sitemap XML entry for a single event. - # If it's coming up soon - elif start_time_delta.days < 30: - changefreq_node.text = 'daily' + Args: + event: DBEvent instance - else: - changefreq_node.text = 'weekly' - - url_node.append(loc_node) - url_node.append(changefreq_node) - url_node.append(priority_node) - # prints out as one line - yield '%s\n' % etree.tostring(url_node) - - -map_sitemap_event = fb_mapreduce.mr_wrap(yield_sitemap_event) -sitemap_event = fb_mapreduce.nomr_wrap(yield_sitemap_event) + Returns: + XML string for the URL entry, or None if event should be skipped + """ + if not event.has_content(): + return None + url_node = etree.Element('url') + loc_node = etree.Element('loc') + loc_node.text = urls.dd_event_url(event) -@app.route('/tasks/generate_sitemaps') -class ReloadEventsHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - queue = self.request.get('queue', 'fast-queue') - time_period = self.request.get('time_period', None) - vertical = self.request.get('vertical', None) - - filters = [] - if vertical: - filters.append(('verticals', '=', vertical)) - vertical_string = '%s ' % vertical - else: - vertical_string = '' - - if time_period: - filters.append(('search_time_period', '=', time_period)) - name = 'Generate %s %sSitemaps' % (time_period, vertical_string) + if event.is_fb_event: + if 'updated_time' in event.fb_event.get('info', {}): + lastmod_node = etree.Element('lastmod') + updated = event.fb_event['info']['updated_time'] + updated = updated.replace('+0000', '+00:00') + lastmod_node.text = updated + url_node.append(lastmod_node) else: - name = 'Generate %sSitemaps' % vertical_string - fb_mapreduce.start_map( - fbl=self.fbl, - name=name, - handler_spec='dancedeets.sitemaps.events.map_sitemap_event', - entity_kind='dancedeets.events.eventdata.DBEvent', - handle_batch_size=20, - filters=filters, - 
queue=queue, - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) - - post = get + logging.debug('Event %s does not have updated_time', event.id) + + changefreq_node = etree.Element('changefreq') + priority_node = etree.Element('priority') + + if event.end_time: + end_time = event.end_time + else: + end_time = event.start_time + datetime.timedelta(hours=2) + + start_time_delta = event.start_time - datetime.datetime.now() + end_time_delta = end_time - datetime.datetime.now() + event_delta = end_time - event.start_time + + priority_node.text = '0.5' + + # Event is active and not a multi-week event: + if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: + changefreq_node.text = 'hourly' + # If it ended awhile ago + elif end_time_delta.days < -30: + changefreq_node.text = 'yearly' + priority_node.text = '0.1' + elif end_time_delta.days < -10: + changefreq_node.text = 'weekly' + # If it's coming up soon + elif start_time_delta.days < 30: + changefreq_node.text = 'daily' + else: + changefreq_node.text = 'weekly' + + url_node.append(loc_node) + url_node.append(changefreq_node) + url_node.append(priority_node) + + return etree.tostring(url_node, encoding='unicode') diff --git a/server/dancedeets/users/user_event_tasks.py b/server/dancedeets/users/user_event_tasks.py index dc094555..b3aa51d4 100644 --- a/server/dancedeets/users/user_event_tasks.py +++ b/server/dancedeets/users/user_event_tasks.py @@ -1,49 +1,58 @@ -from dancedeets.compat.mapreduce import control +""" +User event statistics utilities. -from dancedeets import app -from dancedeets import base_servlet +The batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.compute_user_stats + +This module retains the core update_user_qualities function for reuse. +""" from dancedeets.events import eventdata from dancedeets.event_scraper import potential_events def update_user_qualities(user): - #STR_ID_MIGRATE - source_potential_events = potential_events.PotentialEvent.gql('WHERE source_ids = :graph_id', graph_id=long(user.fb_uid)).fetch(1000) - added_events = eventdata.DBEvent.get_by_ids([x.fb_event_id for x in source_potential_events]) + """ + Calculate and update user event contribution statistics. 
+ + Counts: + - Auto-added events (via ML classifier) + - Auto-added own events (user is the event owner) + - Hand-added events (manually added by user) + - Hand-added own events (user is both creator and owner) + """ + # STR_ID_MIGRATE + try: + fb_uid_long = int(user.fb_uid) + except (ValueError, TypeError): + fb_uid_long = user.fb_uid + + source_potential_events = potential_events.PotentialEvent.gql( + 'WHERE source_ids = :graph_id', graph_id=fb_uid_long + ).fetch(1000) + + added_events = eventdata.DBEvent.get_by_ids( + [x.fb_event_id for x in source_potential_events] + ) user.num_auto_added_events = len([ - x for x in added_events if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] ]) user.num_auto_added_own_events = len([ x for x in added_events - if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] and x.owner_fb_uid == user.fb_uid + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + and x.owner_fb_uid == user.fb_uid ]) - #STR_ID_MIGRATE + # STR_ID_MIGRATE user.num_hand_added_events = len([ - x for x in added_events if x and x.creating_method == eventdata.CM_USER and str(x.creating_fb_uid) == user.fb_uid + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid ]) - #STR_ID_MIGRATE + # STR_ID_MIGRATE user.num_hand_added_own_events = len([ x for x in added_events - if x and x.creating_method == eventdata.CM_USER and str(x.creating_fb_uid) == user.fb_uid and x.owner_fb_uid == user.fb_uid + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + and x.owner_fb_uid == user.fb_uid ]) - - -def map_compute_user_stats(user): - update_user_qualities(user) - user.put() - - -@app.route('/tasks/recompute_user_stats') -class RecomputeUserStatsHandler(base_servlet.BaseTaskRequestHandler): - def get(self): - control.start_map( - name='Compute User-Event Stats', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.users.user_event_tasks.map_compute_user_stats', - mapper_parameters={'entity_kind': 'dancedeets.users.users.User'}, - queue_name='fast-queue', - shard_count=5, - ) - - post = get diff --git a/server/dancedeets/users/user_tasks.py b/server/dancedeets/users/user_tasks.py index c778a9aa..cf99206a 100644 --- a/server/dancedeets/users/user_tasks.py +++ b/server/dancedeets/users/user_tasks.py @@ -1,14 +1,25 @@ +""" +User management tasks. + +The batch user refresh has been migrated to Cloud Run Jobs. 
+See: dancedeets.jobs.refresh_users + +This module retains: +- LookupAppFriendUsers: FB API lookup type for friend tracking +- TrackNewUserFriendsHandler: Handler for tracking new user friends +- LoadUserHandler: Handler for loading specific users +- fetch_and_save_fb_user: Core function for FB user refresh +""" import logging from dancedeets import app from dancedeets import base_servlet from dancedeets import fb_api -from dancedeets.mail import mailchimp_api -from dancedeets.util import fb_mapreduce from dancedeets.users import users class LookupAppFriendUsers(fb_api.LookupType): + """FB API lookup type for getting app friends.""" @classmethod def get_lookups(cls, object_id): return [('info', cls.url('%s/friends' % object_id))] @@ -16,6 +27,7 @@ def get_lookups(cls, object_id): @app.route('/tasks/track_newuser_friends') class TrackNewUserFriendsHandler(base_servlet.BaseTaskFacebookRequestHandler): + """Track friends for newly registered users.""" def get(self): key = fb_api.generate_key(LookupAppFriendUsers, self.fb_uid) fb_result = self.fbl.fb.fetch_keys([key]) @@ -30,53 +42,23 @@ def get(self): @app.route('/tasks/load_users') class LoadUserHandler(base_servlet.UserOperationHandler): + """Load specific users from Facebook.""" user_operation = lambda self, fbl, load_users: [load_fb_user(fbl, x) for x in load_users] -@app.route('/tasks/reload_all_users') -class ReloadAllUsersHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - all_users = self.request.get('all_users', '0') == '1' - if all_users: - filters = [] - else: - filters = [('expired_oauth_token', '=', False)] - # this calls a map function wrapped by mr_user_wrap, so it works correctly on a per-user basis - mailchimp_list_id = mailchimp_api.get_list_id() - fb_mapreduce.start_map( - fbl=self.fbl, - name='Load %sUsers' % ('All ' if all_users else ''), - handler_spec='dancedeets.users.user_tasks.map_load_fb_user', - entity_kind='dancedeets.users.users.User', - filters=filters, - extra_mapper_params={ - 'mailchimp_list_id': mailchimp_list_id, - }, - queue='fast-queue' - ) - - post = get - - -def yield_load_fb_user(fbl, user): - if user.expired_oauth_token: - logging.info('Skipping user %s (%s) due to expired access_token', user.fb_uid, user.full_name) - user.put() - elif not fbl.access_token: - logging.info('Skipping user %s (%s) due to not having an access_token', user.fb_uid, user.full_name) - user.put() - else: - fetch_and_save_fb_user(fbl, user) - # The above function calls user.put(), so no need for: - # users.update_mailchimp(user) - - def fetch_and_save_fb_user(fbl, user): + """ + Fetch user data from Facebook and save to Datastore. 
+
+    This is the core function used by both:
+    - Cloud Run Job: dancedeets.jobs.refresh_users
+    - LoadUserHandler for individual user loading
+    """
     try:
         fb_user = fbl.get(fb_api.LookupUser, user.fb_uid)
     except fb_api.ExpiredOAuthToken as e:
         logging.info('Auth token now expired, mark as such: %s', e)
-        user.expired_oauth_token_reason = e.args[0]
+        user.expired_oauth_token_reason = e.args[0] if e.args else "Unknown"
         user.expired_oauth_token = True
         user.put()
         return
@@ -85,5 +67,13 @@ def fetch_and_save_fb_user(fbl, user):
     user.put()
 
 
-map_load_fb_user = fb_mapreduce.mr_user_wrap(yield_load_fb_user)
-load_fb_user = fb_mapreduce.nomr_wrap(yield_load_fb_user)
+def load_fb_user(fbl, user):
+    """Load and save a single user (wrapper for non-mapreduce context)."""
+    if user.expired_oauth_token:
+        logging.info('Skipping user %s (%s) due to expired access_token', user.fb_uid, user.full_name)
+        user.put()
+    elif not fbl.access_token:
+        logging.info('Skipping user %s (%s) due to not having an access_token', user.fb_uid, user.full_name)
+        user.put()
+    else:
+        fetch_and_save_fb_user(fbl, user)
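Editorial note on the new entry point: the deprecated handler in `thing_scraper2.py` tells operators to run `python -m dancedeets.jobs.runner --job=scrape_and_classify`. Only that module path and flag come from the patch itself; the sketch below shows one way such a runner could dispatch to the new `dancedeets.jobs.*` modules and shard work across Cloud Run Job tasks, assuming each job module exposes a `run(task_index, task_count)` function (an illustrative convention, not the committed API). `CLOUD_RUN_TASK_INDEX` and `CLOUD_RUN_TASK_COUNT` are the environment variables Cloud Run Jobs sets on each task.

```python
"""Hypothetical sketch of dancedeets/jobs/runner.py; not the committed implementation."""
import argparse
import importlib
import logging
import os


def main():
    parser = argparse.ArgumentParser(description='Run a DanceDeets batch job on Cloud Run Jobs.')
    parser.add_argument('--job', required=True,
                        help='Job module name, e.g. scrape_and_classify or compute_user_stats')
    args = parser.parse_args()

    # Cloud Run Jobs inject these, so each parallel task can claim a disjoint slice of the work.
    task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0'))
    task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1'))

    logging.basicConfig(level=logging.INFO)
    logging.info('Starting job %s (task %s of %s)', args.job, task_index, task_count)

    # Assumed convention: every module under dancedeets.jobs exposes run(task_index, task_count).
    module = importlib.import_module('dancedeets.jobs.%s' % args.job)
    module.run(task_index=task_index, task_count=task_count)


if __name__ == '__main__':
    main()
```

Each job's `run()` would then page through its entity kind with a Datastore cursor and process only the entities that hash to its own task index, which is one way to keep parallel tasks from overlapping.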
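The same pattern applies to sitemap generation: the retained `generate_sitemap_entry()` in `sitemaps/events.py` produces one `<url>` node per event, so the job only needs to collect those strings and upload a wrapped file. A minimal sketch under stated assumptions: `iter_events()`, the bucket name, and the object path are placeholders, while the upload calls are the standard `google-cloud-storage` client API.

```python
"""Hypothetical sketch of dancedeets/jobs/generate_sitemaps.py; not the committed implementation."""
import logging

from google.cloud import storage

from dancedeets.sitemaps import events as sitemap_events

SITEMAP_HEADER = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                  '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
SITEMAP_FOOTER = '</urlset>\n'


def run(task_index=0, task_count=1, bucket_name='example-sitemap-bucket'):
    """Generate one sitemap file per task and upload it to GCS."""
    entries = []
    for event in iter_events(task_index, task_count):
        entry = sitemap_events.generate_sitemap_entry(event)
        if entry:
            entries.append(entry + '\n')

    body = SITEMAP_HEADER + ''.join(entries) + SITEMAP_FOOTER
    blob = storage.Client().bucket(bucket_name).blob('sitemaps/events-%03d.xml' % task_index)
    blob.upload_from_string(body, content_type='application/xml')
    logging.info('Wrote %s sitemap entries for task %s', len(entries), task_index)


def iter_events(task_index, task_count):
    """Placeholder for the real DBEvent query (cursor-paged, filtered, and sharded by task)."""
    return []
```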
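One contract worth spelling out: the rewritten `retrieve_summary()` in `rankings.py` now returns zero totals whenever memcache is cold, so whatever replaces the deleted `_compute_summary()` has to publish its results under the same `TOTALS_KEY` for the site to show real counts. A hedged sketch of that hand-off, with placeholder entity iterators; the memcache call mirrors the one the deleted code used.

```python
"""Hypothetical sketch of dancedeets/jobs/compute_rankings.py; not the committed implementation."""
import collections
import logging

from dancedeets.rankings import rankings
from dancedeets.util import memcache


def run(task_index=0, task_count=1):
    """Tally events and users per city in memory, then publish totals where retrieve_summary() looks."""
    event_counts = collections.Counter()
    user_counts = collections.Counter()

    for event in iter_events():  # placeholder for a DBEvent query
        if event.city_name:
            event_counts[event.city_name] += 1
    for user in iter_users():  # placeholder for a User query
        if user.city_name:
            user_counts[user.city_name] += 1

    totals = dict(total_events=sum(event_counts.values()),
                  total_users=sum(user_counts.values()))
    # retrieve_summary() reads exactly this key, so the job is responsible for keeping it warm.
    memcache.set(rankings.TOTALS_KEY, totals, rankings.TOTALS_EXPIRY)
    logging.info('Published ranking totals: %s', totals)


def iter_events():
    """Placeholder for the real DBEvent iteration."""
    return []


def iter_users():
    """Placeholder for the real User iteration."""
    return []
```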