diff --git a/.env.example b/.env.example index 17c0873..06d3c77 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,13 @@ # Thinkific-Downloader Environment Configuration # Copy this file to .env and fill in your actual values +# =============================================== +# REQUIRED AUTHENTICATION +# =============================================== + # For downloading all content, use the course link. COURSE_LINK="https://your-thinkific-site.com/api/course_player/v2/courses/your-course-name" -# For selective content downloads, use the JSON file created from Thinki Parser. -# Copy the file to the Thinkifi Downloader root folder. -# Specify the file name below. Ex. COURSE_DATA_FILE="modified-course.json" -COURSE_DATA_FILE="" - # Client date header - Get this from browser Developer Tools Network tab CLIENT_DATE="2025-09-23T07:42:31.512Z" @@ -16,31 +15,69 @@ CLIENT_DATE="2025-09-23T07:42:31.512Z" # IMPORTANT: Keep this secret and never share it! COOKIE_DATA="_thinkific_session=YOUR_SESSION_COOKIE_HERE" +# =============================================== +# BASIC SETTINGS +# =============================================== + # Quality Available: "Original File", "1080p", "720p", "540p", "360p", "224p" # Recommended: "720p" for good quality and reasonable file size VIDEO_DOWNLOAD_QUALITY="720p" -# Set to true to download all available video formats/qualities -# Warning: This will significantly increase download size and time -# ALL_VIDEO_FORMATS=false +# Set download directory (defaults to ./downloads) +# All course content will be downloaded to this directory +OUTPUT_DIR="./downloads" + +# =============================================== +# ENHANCED FEATURES +# =============================================== + +# Number of concurrent downloads (default: 3, recommended: 1-5) +# Higher numbers may trigger rate limiting +CONCURRENT_DOWNLOADS=3 + +# Delay between downloads in seconds (default: 1.0) +# Increase if you encounter rate limiting issues +DOWNLOAD_DELAY=1.0 
+ +# Number of retry attempts for failed downloads (default: 3) +RETRY_ATTEMPTS=3 + +# Rate limiting in MB/s (default: unlimited) +# Set a value to limit download speed (e.g., RATE_LIMIT_MB_S=5.0) +# RATE_LIMIT_MB_S= + +# File validation after download (default: true) +# Validates file integrity and size +VALIDATE_DOWNLOADS=true + +# Resume partial downloads (default: true) +# Automatically resume interrupted downloads +RESUME_PARTIAL=true + +# Debug mode (default: false) +# Enable detailed logging for troubleshooting +DEBUG=false + +# =============================================== +# ADVANCED SETTINGS +# =============================================== # Set to true to enable ffmpeg presentation merging (requires ffmpeg installed) # This combines multi-part presentations into single video files -# FFMPEG_PRESENTATION_MERGE=false - -# Optional: Set download directory (defaults to ./downloads) -# OUTPUT_DIR="./downloads" +FFMPEG_PRESENTATION_MERGE=false -# Optional: Number of concurrent downloads (default: 2) -# Higher numbers may trigger rate limiting -# CONCURRENT_DOWNLOADS=2 +# =============================================== +# OPTIONAL FEATURES (LEGACY SUPPORT) +# =============================================== -# Optional: Delay between downloads in seconds (default: 1) -# Increase if you encounter rate limiting issues -# DOWNLOAD_DELAY=1 +# For selective content downloads, use the JSON file created from Thinki Parser. +# Copy the file to the Thinkific-Downloader root folder. +# Specify the file name below. Ex. 
COURSE_DATA_FILE="modified-course.json" +# COURSE_DATA_FILE="" -# Optional: Number of retry attempts for failed downloads (default: 3) -# RETRY_ATTEMPTS=3 +# Set to true to download all available video formats/qualities +# Warning: This will significantly increase download size and time +# ALL_VIDEO_FORMATS=false -# Optional: Log level (DEBUG, INFO, WARNING, ERROR) +# Log level (DEBUG, INFO, WARNING, ERROR) # LOG_LEVEL="INFO" \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bf14598 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,43 @@ +name: ๐Ÿงช CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + name: ๐Ÿงช Basic Tests + runs-on: ubuntu-latest + + steps: + - name: ๐Ÿ—๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐Ÿ Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: ๐Ÿ“ฆ Install dependencies + run: | + pip install -r requirements.txt + + - name: ๐Ÿงช Test imports + run: | + python -c "import thinkific_downloader; print('โœ… Package imports work')" + python -c "from thinkific_downloader.config import Settings; print('โœ… Config works')" + + docker: + name: ๐Ÿณ Docker Build + runs-on: ubuntu-latest + + steps: + - name: ๐Ÿ—๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐Ÿณ Build Docker image + run: | + docker build -t thinkific-downloader:test . 
+ echo "โœ… Docker image builds successfully" \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..3ebe7c0 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,117 @@ +name: ๐Ÿš€ Release + +on: + push: + tags: + - 'v*.*.*' + +jobs: + release: + name: ๐Ÿ“ฆ Create Release + runs-on: ubuntu-latest + + steps: + - name: ๐Ÿ—๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐Ÿท๏ธ Get version + id: version + run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + + - name: ๐ŸŽ‰ Create Release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.version.outputs.tag }} + release_name: ๐Ÿš€ Release ${{ steps.version.outputs.tag }} + body: | + ## ๐ŸŽ‰ New Release: ${{ steps.version.outputs.tag }} + + ### ๐Ÿš€ Installation Options + + **Docker Hub:** + ```bash + docker pull kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} + # or + docker pull kvnxo/thinkific-downloader:latest + ``` + + **GitHub Packages:** + ```bash + docker pull ghcr.io/itskavin/thinkific-downloader:${{ steps.version.outputs.tag }} + # or + docker pull ghcr.io/itskavin/thinkific-downloader:latest + ``` + + **Setup and Run:** + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + cp .env.example .env + # Edit .env with your details + docker-compose up + ``` + + **Python Direct:** + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + pip install -r requirements.txt + python thinkificdownloader.py + ``` + + ### ๐ŸŽฏ Key Features + - Downloads to `./downloads/` by default + - Docker support for easy setup + - Parallel downloads + - Smart resume functionality + draft: false + prerelease: false + + docker: + name: ๐Ÿณ Build Docker + runs-on: ubuntu-latest + needs: release + if: success() + + steps: + - name: ๐Ÿ—๏ธ Checkout + uses: 
actions/checkout@v4 + + - name: ๐Ÿท๏ธ Get version + id: version + run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + + - name: ๐Ÿ”‘ Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: ๐Ÿ”‘ Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: ๐Ÿ”ง Build and Push to Docker Hub + run: | + docker build -t kvnxo/thinkific-downloader:latest . + docker build -t kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} . + docker push kvnxo/thinkific-downloader:latest + docker push kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} + + - name: ๐Ÿ“ฆ Build and Push to GitHub Packages + run: | + # Convert repository name to lowercase for GitHub Container Registry + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + + # Build and tag for GitHub Container Registry + docker build -t ghcr.io/${REPO_LOWER}:latest . + docker build -t ghcr.io/${REPO_LOWER}:${{ steps.version.outputs.tag }} . + + # Push to GitHub Container Registry + docker push ghcr.io/${REPO_LOWER}:latest + docker push ghcr.io/${REPO_LOWER}:${{ steps.version.outputs.tag }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 385d88f..e3dadea 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,6 @@ __pycache__/ build/ develop-eggs/ dist/ -downloads/ eggs/ .eggs/ lib/ @@ -93,4 +92,5 @@ ffmpeg.log # Docker runtime artifacts (keep config files in git) .docker/ docker-volumes/ -*.pid \ No newline at end of file +*.pid +thinkific-launch-accelerator-course-october-2025/.download_status.json.bak diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 78e7212..a0cdb5b 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -62,13 +62,15 @@ graph TB - Retry logic with exponential backoff - Resource management -#### **4. 
File Validator** (`file_validator.py`) -- **Purpose**: Smart file validation and skip logic + +#### **4. Resume Tracker** (`resume_tracker.py`) +- **Purpose**: Atomic, cross-platform resume and status tracking - **Features**: - - File integrity checking (size, checksums) - - Resume detection and validation - - Download metadata persistence - - Smart skip decisions + - Download status tracking and backup (Windows, Mac, Linux) + - File integrity checking (size, checksums) + - Resume detection and validation + - Download metadata persistence + - Smart skip decisions #### **5. Content Processors** - **Wistia Downloader** (`wistia_downloader.py`): Video processing @@ -179,7 +181,7 @@ graph TD 3. **Course Processing**: API calls → Content parsing → Task creation 4. **Download Orchestration**: Task queue → `download_manager.py` → Parallel workers 5. **Progress Tracking**: Thread-safe updates → `progress_manager.py` → Rich UI -6. 
**Validation**: File checks โ†’ `resume_tracker.py` โ†’ Skip decisions --- @@ -286,7 +288,7 @@ tests/ โ”œโ”€โ”€ unit/ โ”‚ โ”œโ”€โ”€ test_progress_manager.py โ”‚ โ”œโ”€โ”€ test_download_manager.py -โ”‚ โ”œโ”€โ”€ test_file_validator.py +โ”‚ โ”œโ”€โ”€ test_resume_tracker.py โ”‚ โ””โ”€โ”€ test_enhanced_downloader.py โ”œโ”€โ”€ integration/ โ”‚ โ”œโ”€โ”€ test_full_download.py @@ -339,6 +341,18 @@ class TestProgressManager: # Assert assert file_id in progress_manager.downloads assert download.filename == filename + + def test_resume_tracker_atomic_save(self): + from thinkific_downloader.resume_tracker import ResumeTracker + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + status_file = Path(tmpdir) / ".download_status.json" + tracker = ResumeTracker(str(status_file)) + tracker.status_data["test"] = {"status": "completed"} + tracker._save_status() + assert status_file.exists() + backup_file = status_file.with_suffix('.json.bak') + assert backup_file.exists() @patch('thinkific_downloader.progress_manager.time.time') def test_calculate_download_speed(self, mock_time, progress_manager): diff --git a/README.md b/README.md index 80785bf..7bf7bf3 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,12 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **๐Ÿง  Smart File Validation** - Automatic integrity checking and corruption detection - **โ–ถ๏ธ Resume Downloads** - Intelligent partial download recovery and continuation - **โญ๏ธ Skip Existing Files** - Automatic detection and skipping of completed downloads +- **๐Ÿ’พ Atomic Resume/Backup System** - Cross-platform safe status tracking and backup (Windows, Mac, Linux) ### ๐ŸŽฏ **Progress Monitoring** -``` -๐Ÿ’พ introduction.mp4 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 100% 156.2MB โ€ข 12.3MB/s โ€ข Complete -๐Ÿ”„ lesson-02.mp4 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 45% 89.1MB/198.4MB โ€ข 
8.7MB/s โ€ข 0:00:12 -โณ lesson-03.pdf โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 0% Queued -``` +#### Example Progress UI + +![Progress UI](images/image.png) ### ๐Ÿ”’ **Reliability & Safety** - **๐Ÿ”„ Exponential Retry Logic** - Smart retry with jitter for failed downloads @@ -69,6 +68,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Rich Terminal Interface** - Beautiful progress bars and status updates - **Smart File Organization** - Logical folder structure with clean naming - **Resume Support** - Skip existing files, continue interrupted downloads +- **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.) - **Comprehensive Logging** - Debug mode for troubleshooting @@ -77,28 +77,84 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Session Management** - Proper authentication handling - **Error Recovery** - Graceful handling of network issues - **Validation** - File integrity checks and cleanup +- **Atomic Status File** - Download status is always saved safely, with backup, for reliable resume ## ๐ŸŽฏ **Quick Start** +**โš ๏ธ Important**: Always clone or download the project first! The application needs access to the project directory for downloads, configuration files (.env), and proper functionality. 
+ ### **๐Ÿณ Docker (Recommended)** +**Step 1: Get the Project** +```bash +# Clone or download the project +git clone https://github.com/itskavin/Thinkific-Downloader.git +cd Thinkific-Downloader + +# Or download and extract ZIP, then navigate to project directory +``` + +**Step 2: Setup Environment** ```bash +# Create your .env file (see configuration section below) +cp .env.example .env +# Edit .env with your course details +``` + +**Step 3: Run with Docker** +```bash +# Option 1: Docker Hub docker pull kvnxo/thinkific-downloader -docker run -it --rm -v $(pwd)/downloads:/app/downloads kvnxo/thinkific-downloader +docker run -it --rm -v $(pwd)/downloads:/app/downloads --env-file .env kvnxo/thinkific-downloader + +# Option 2: GitHub Packages +docker pull ghcr.io/itskavin/thinkific-downloader +docker run -it --rm -v $(pwd)/downloads:/app/downloads --env-file .env ghcr.io/itskavin/thinkific-downloader + +# Option 3: Docker Compose (recommended) +docker-compose up ``` ### **๐Ÿ Python Direct** ```bash +# Step 1: Clone the project git clone https://github.com/itskavin/Thinkific-Downloader.git cd Thinkific-Downloader + +# Step 2: Install dependencies pip install -r requirements.txt -# Update environment variables in .env or export them directly -python thinkidownloader3.py +# Step 3: Configure and run +# Update environment variables in .env file +python thinkificdownloader.py +``` + +### **๐Ÿ“ฆ Source Code Packages** + +Get the latest source code: + +```bash +# Clone the repository +git clone https://github.com/itskavin/Thinkific-Downloader.git +cd Thinkific-Downloader + +# Setup and run with Docker +cp .env.example .env +# Edit .env with your course details +docker-compose up +# Or run with Python +pip install -r requirements.txt +python thinkificdownloader.py ``` +> **Resume/Backup System:** +> - Download status is tracked in `.download_status.json` (atomic, cross-platform) +> - A backup `.download_status.json.bak` is created automatically before each update +> - If 
interrupted, simply rerun the downloader to resume from where you left off +> - Works seamlessly on Windows, Mac, and Linux + > ๐Ÿ“– **Need detailed setup instructions?** Check out our comprehensive [**SETUP.md**](SETUP.md) guide for step-by-step installation, troubleshooting, and configuration options. > ๐Ÿ‘จโ€๐Ÿ’ป **Developer?** Visit [**DEVELOPMENT.md**](DEVELOPMENT.md) for architecture overview, API reference, and contribution guidelines. @@ -108,27 +164,36 @@ python thinkidownloader3.py Configure advanced features via environment variables or `.env` file: ```bash -# Required +# =============================================== +# REQUIRED AUTHENTICATION +# =============================================== COURSE_LINK="" # Thinkific course URL COOKIE_DATA="" # Browser cookies for authentication CLIENT_DATE="" # Client date header -# Optional - Performance -VIDEO_DOWNLOAD_QUALITY="Original File" # Video quality (Original File,720p, 1080p, etc.) -CONCURRENT_DOWNLOADS=3 # Number of parallel downloads (1-10 recommended) +# =============================================== +# BASIC SETTINGS +# =============================================== +VIDEO_DOWNLOAD_QUALITY="720p" # Video quality (Original File, 720p, 1080p, etc.) 
+OUTPUT_DIR="./downloads" # Download directory (defaults to ./downloads) + +# =============================================== +# ENHANCED FEATURES +# =============================================== +CONCURRENT_DOWNLOADS=3 # Number of parallel downloads (1-5 recommended) RETRY_ATTEMPTS=3 # Number of retry attempts for failed downloads -RATE_LIMIT_MB_S=0 # Rate limit in MB/s (0 = unlimited) DOWNLOAD_DELAY=1.0 # Delay between downloads (seconds) +RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited) -# Optional - Features +# Feature toggles VALIDATE_DOWNLOADS=true # Enable file integrity validation RESUME_PARTIAL=true # Enable resume for partial downloads DEBUG=false # Enable debug logging -# Optional - System -OUTPUT_DIR=./downloads # Download directory +# =============================================== +# ADVANCED SETTINGS +# =============================================== FFMPEG_PRESENTATION_MERGE=false # Enable FFmpeg presentation merging -LOG_LEVEL=INFO # Logging level (DEBUG, INFO, WARNING) ``` ``` @@ -151,24 +216,29 @@ docker-compose up ## ๐Ÿ“ **Output Structure** +**Default Location**: All courses are downloaded to `./downloads/` directory in your project folder. + ``` -๐Ÿ“ Course Name/ -โ”œโ”€โ”€ ๐Ÿ“ 01. Introduction/ -โ”‚ โ”œโ”€โ”€ ๐Ÿ“ 01. Welcome Video/ -โ”‚ โ”‚ โ”œโ”€โ”€ ๐ŸŽฅ welcome-video.mp4 -โ”‚ โ”‚ โ””โ”€โ”€ ๐Ÿ“„ video-info.json -โ”‚ โ””โ”€โ”€ ๐Ÿ“ 02. Course Overview/ -โ”‚ โ”œโ”€โ”€ ๐Ÿ“„ course-overview.html -โ”‚ โ””โ”€โ”€ ๐Ÿ“Š quiz-structure.json -โ”œโ”€โ”€ ๐Ÿ“ 02. Getting Started/ -โ”‚ โ””โ”€โ”€ ๐Ÿ“ 01. Setup Instructions/ -โ”‚ โ”œโ”€โ”€ ๐ŸŽฅ setup-instructions.mp4 -โ”‚ โ”œโ”€โ”€ ๐Ÿ“„ setup-guide.pdf -โ”‚ โ””โ”€โ”€ ๐ŸŽจ presentation-slides.mp4 -โ”œโ”€โ”€ ๐Ÿ“„ course-metadata.json -โ””โ”€โ”€ ๐Ÿ“Š download-summary.json +๐Ÿ“ downloads/ +โ””โ”€โ”€ ๐Ÿ“ Course Name/ + โ”œโ”€โ”€ ๐Ÿ“ 01. Introduction/ + โ”‚ โ”œโ”€โ”€ ๐Ÿ“ 01. 
Welcome Video/ + โ”‚ โ”‚ โ”œโ”€โ”€ ๐ŸŽฅ welcome-video.mp4 + โ”‚ โ”‚ โ””โ”€โ”€ ๐Ÿ“„ video-info.json + โ”‚ โ””โ”€โ”€ ๐Ÿ“ 02. Course Overview/ + โ”‚ โ”œโ”€โ”€ ๐Ÿ“„ course-overview.html + โ”‚ โ””โ”€โ”€ ๐Ÿ“Š quiz-structure.json + โ”œโ”€โ”€ ๐Ÿ“ 02. Getting Started/ + โ”‚ โ””โ”€โ”€ ๐Ÿ“ 01. Setup Instructions/ + โ”‚ โ”œโ”€โ”€ ๐ŸŽฅ setup-instructions.mp4 + โ”‚ โ”œโ”€โ”€ ๐Ÿ“„ setup-guide.pdf + โ”‚ โ””โ”€โ”€ ๐ŸŽจ presentation-slides.mp4 + โ”œโ”€โ”€ ๐Ÿ“„ course-metadata.json + โ””โ”€โ”€ ๐Ÿ“Š download-summary.json ``` +**Customization**: Set `OUTPUT_DIR=./my-custom-path` in your `.env` file to change the download location. + ### **Supported Content Types** @@ -182,7 +252,22 @@ docker-compose up | **Quizzes** | `.json` | Structure export | Question/answer format | ## โ“ **FAQ** +### **Resume/Backup System** + +**Q: How does resume work?** +- The downloader automatically tracks download status in `.download_status.json`. +- Before updating, a backup `.download_status.json.bak` is created (atomic, safe). +- If interrupted, just rerun the downloader. It will resume partial downloads, skip completed files, and retry failed ones. +- No manual intervention needed. + +**Q: Is it safe on Windows, Mac, Linux?** +- Yes! The resume/backup system uses atomic file operations and works on all major platforms. + +**Q: Where is the status file stored?** +- In the current working directory (where you run the downloader). +**Q: Can I delete the status file?** +- Yes, but you will lose resume progress. The backup file is for safety only. 
### **๐Ÿ” Authentication & Setup** **Q: How do I get the required authentication data?** diff --git a/SETUP.md b/SETUP.md index 244afd3..a1318fc 100644 --- a/SETUP.md +++ b/SETUP.md @@ -36,88 +36,38 @@ This comprehensive guide walks you through installing and configuring Thinkific- ## ๐Ÿš€ Installation Methods -### **Method 1: Docker (Recommended - Easiest)** - -Docker provides the most consistent and hassle-free experience with all dependencies pre-installed. - -#### **1.1 Install Docker** -- **Windows/Mac**: Download [Docker Desktop](https://www.docker.com/products/docker-desktop/) -- **Linux**: Follow [Docker installation guide](https://docs.docker.com/engine/install/) - -#### **1.2 Pull and Run** -```bash -# Pull the latest image -docker pull kvnxo/thinkific-downloader:latest - -# Run with basic setup -docker run -it --rm \ - -v $(pwd)/downloads:/app/downloads \ - -e COURSE_LINK="YOUR_COURSE_URL" \ - -e COOKIE_DATA="YOUR_COOKIES" \ - -e CLIENT_DATE="YOUR_CLIENT_DATE" \ - kvnxo/thinkific-downloader:latest -``` - -#### **1.3 Docker Compose (Recommended)** -Create `docker-compose.yml`: -```yaml -version: '3.8' -services: - thinkific-downloader: - image: kvnxo/thinkific-downloader:latest - volumes: - - ./downloads:/app/downloads - - ./.env:/app/.env - environment: - - COURSE_LINK=${COURSE_LINK} - - COOKIE_DATA=${COOKIE_DATA} - - CLIENT_DATE=${CLIENT_DATE} - # Enhanced features - - CONCURRENT_DOWNLOADS=3 - - RETRY_ATTEMPTS=3 -``` - -Run with: `docker-compose up` +### **๐Ÿ“ฆ Option 1: Clone Repository (Recommended)** + +Get the latest version directly from GitHub: + +1. **Clone the repository**: + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + ``` + +2. **Setup configuration**: + ```bash + cp .env.example .env + # Edit .env with your course details (see Authentication Setup below) + ``` + +3. 
**Run with Docker** (Recommended): + ```bash + docker-compose up + ``` + + **Or run with Python**: + ```bash + pip install -r requirements.txt + python thinkificdownloader.py + ``` --- -### **Method 2: Python Package Installation** - -For users who prefer native Python installation with full control. - -#### **2.1 Clone Repository** -```bash -# Clone the repository -git clone https://github.com/itskavin/Thinkific-Downloader.git -cd Thinkific-Downloader -``` - -#### **2.2 Quick Setup (Automated)** -```bash -# Run the automated installer -python install.py -``` - -#### **2.3 Manual Installation** -```bash -# Create virtual environment (recommended) -python -m venv venv -source venv/bin/activate # Linux/Mac -# or -venv\Scripts\activate # Windows - -# Install dependencies -pip install -r requirements.txt +### **๐Ÿณ Option 2: Docker Only** -# Install in development mode -pip install -e . -``` - -#### **2.4 Verify Installation** -```bash -# Test the installation -python -m thinkific_downloader --help -``` +If you want to use Docker without cloning: --- @@ -204,6 +154,10 @@ VALIDATE_DOWNLOADS=true # Resume Partial Downloads (true/false) RESUME_PARTIAL=true +# Atomic Resume/Backup System (always enabled) +# Download status is tracked in .download_status.json (atomic, cross-platform) +# A backup .download_status.json.bak is created automatically before each update + # Debug Mode (true/false) DEBUG=false @@ -280,6 +234,15 @@ docker run -it --rm \ ๐Ÿ“š Course: Your Course Name | Progress: 0.0% (0/25 files) | Speed: 0.0 MB/s | ETA: Unknown +๐Ÿ“Š Resume Status Summary + โœ… 5 files already completed + ๐Ÿ“ฅ 2 files partially downloaded (will resume) + โŒ 1 files previously failed (will retry) + +๐Ÿ“ Files to download: 31 +๐Ÿ”„ Parallel workers: 3 +โšก Enhanced features: Rate limiting, Resume, Validation + ๐ŸŽฅ introduction.mp4 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 100% 156.2MB โ€ข 12.3MB/s โ€ข Complete ๐Ÿ”„ lesson-02.mp4 
โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 45% 89.1MB/198.4MB โ€ข 8.7MB/s โ€ข 0:00:12 โณ lesson-03.pdf โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 0% Queued @@ -535,6 +498,23 @@ After setup, verify everything works: ```bash # Should show course info without downloading python -c "from thinkific_downloader.config import Settings; s=Settings.from_env(); print(f'โœ… Auth OK for {s.client_date[:20]}...')" + +## Resume/Backup System + +**How does resume work?** +- The downloader automatically tracks download status in `.download_status.json`. +- Before updating, a backup `.download_status.json.bak` is created (atomic, safe). +- If interrupted, just rerun the downloader. It will resume partial downloads, skip completed files, and retry failed ones. +- No manual intervention needed. + +**Is it safe on Windows, Mac, Linux?** +- Yes! The resume/backup system uses atomic file operations and works on all major platforms. + +**Where is the status file stored?** +- In the current working directory (where you run the downloader). + +**Can I delete the status file?** +- Yes, but you will lose resume progress. The backup file is for safety only. ``` ### **2. 
Test Network Connection** diff --git a/images/image.png b/images/image.png new file mode 100644 index 0000000..37cb897 Binary files /dev/null and b/images/image.png differ diff --git a/requirements.txt b/requirements.txt index cc647e0..cef341a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ tqdm>=4.65.0 +requests>=2.28.0 rich>=13.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 15ca50e..6416b43 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,16 @@ packages=find_packages(), python_requires=">=3.8", install_requires=[ + "requests>=2.31.0", + "rich>=13.0.0", "tqdm>=4.65.0", + "urllib3>=2.0.0", ], extras_require={ + "enhanced": [ + "beautifulsoup4>=4.12.0", + "lxml>=4.9.0", + ], "brotli": ["brotli>=1.0.9"], }, entry_points={ diff --git a/thinkific_downloader/__init__.py b/thinkific_downloader/__init__.py index 7254b04..ee6f1b4 100644 --- a/thinkific_downloader/__init__.py +++ b/thinkific_downloader/__init__.py @@ -11,7 +11,7 @@ Features: - Modern Python package architecture -- Rich terminal UI with progress tracking +- progress tracking - Smart retry logic and error recovery - Resume support for interrupted downloads - Docker containerization with FFmpeg diff --git a/thinkific_downloader/config.py b/thinkific_downloader/config.py index adf3bb0..5b98829 100644 --- a/thinkific_downloader/config.py +++ b/thinkific_downloader/config.py @@ -27,6 +27,7 @@ class Settings: cookie_data: str video_download_quality: str = '720p' ffmpeg_presentation_merge: bool = False + output_dir: str = './downloads' # Default to downloads directory # Enhanced downloader settings concurrent_downloads: int = 3 retry_attempts: int = 3 @@ -40,23 +41,37 @@ class Settings: @classmethod def from_env(cls): load_env() + + # Required authentication client_date = os.getenv('CLIENT_DATE', '') cookie_data = os.getenv('COOKIE_DATA', '') + + # Basic settings with matching defaults to .env.example video_download_quality = os.getenv('VIDEO_DOWNLOAD_QUALITY', '720p') 
+ output_dir = os.getenv('OUTPUT_DIR', './downloads') + + # Advanced settings ffmpeg_flag_raw = os.getenv('FFMPEG_PRESENTATION_MERGE', 'false').lower() ffmpeg_merge = ffmpeg_flag_raw in ('1', 'true', 'yes', 'on') - # Enhanced settings + # Enhanced downloader settings with matching defaults concurrent_downloads = int(os.getenv('CONCURRENT_DOWNLOADS', '3')) retry_attempts = int(os.getenv('RETRY_ATTEMPTS', '3')) - rate_limit_mb_s = float(os.getenv('RATE_LIMIT_MB_S', '0')) or None download_delay = float(os.getenv('DOWNLOAD_DELAY', '1.0')) + + # Rate limiting - empty string or 0 means unlimited + rate_limit_env = os.getenv('RATE_LIMIT_MB_S', '') + rate_limit_mb_s = float(rate_limit_env) if rate_limit_env and rate_limit_env != '0' else None + + # Feature toggles validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on') resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on') debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on') + # Validation if not client_date or not cookie_data: raise SystemExit('Cookie data and Client Date not set. 
class QueuedSpeedColumn(ProgressColumn):
    """Progress column that renders a task's speed, or ``Queued`` when none is known.

    The value read from ``task.speed`` is treated as bytes per second and is
    shown in MB/s, KB/s or B/s depending on its magnitude.
    """

    # (threshold in bytes/s, unit label), largest first.
    _UNITS = ((1024 * 1024, "MB/s"), (1024, "KB/s"))

    def render(self, task):
        """Return the ``Text`` cell for *task*.

        A missing, zero or negative speed renders as a dim ``Queued``;
        any other speed renders in green with the largest fitting unit.
        """
        try:
            speed = task.speed
        except Exception:
            # Best effort: any failure to obtain a speed is shown as "Queued".
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so Ctrl-C still stops the program.
            speed = None

        if speed is None or speed <= 0:
            return Text("Queued", style="dim")

        for threshold, unit in self._UNITS:
            if speed >= threshold:
                return Text(f"{speed / threshold:.1f} {unit}", style="green")
        return Text(f"{speed:.0f} B/s", style="green")
class RateLimiter:
    """Token bucket rate limiter for controlling download speed.

    Tokens are bytes. They refill at ``rate_limit_bytes_s`` per second and are
    capped at one second's worth. The limiter never sleeps itself:
    :meth:`acquire` returns how long the caller should wait.

    Args:
        rate_limit_mb_s: Maximum throughput in MB/s. ``None``, ``0`` or a
            negative value disables limiting.
        clock: Zero-argument callable returning the current time in seconds.
            Defaults to ``time.monotonic`` so that system clock adjustments
            cannot distort the refill computation; injectable for testing.
    """

    def __init__(self, rate_limit_mb_s: Optional[float] = None,
                 clock: Callable[[], float] = time.monotonic):
        # A non-positive rate would produce negative wait times, so it is
        # treated the same as "no limit".
        if rate_limit_mb_s is not None and rate_limit_mb_s > 0:
            self.rate_limit_bytes_s: Optional[float] = rate_limit_mb_s * 1024 * 1024
        else:
            self.rate_limit_bytes_s = None
        self._clock = clock
        self.tokens = 0.0
        self.last_update = clock()
        self.lock = threading.Lock()

    def acquire(self, size: int) -> float:
        """Reserve *size* bytes and return the number of seconds to sleep.

        Returns ``0.0`` when limiting is disabled or enough tokens are
        available. Otherwise the bucket is drained, the time needed to earn
        the missing tokens is reserved, and that duration is returned.
        """
        if not self.rate_limit_bytes_s:
            return 0.0

        with self.lock:
            now = self._clock()
            # ``last_update`` may lie in the future when an earlier caller
            # reserved wait time; the resulting negative refill makes later
            # callers queue up behind that reservation.
            elapsed = now - self.last_update
            self.tokens += elapsed * self.rate_limit_bytes_s
            self.tokens = min(self.tokens, self.rate_limit_bytes_s)  # Cap at burst rate
            self.last_update = now

            if self.tokens >= size:
                self.tokens -= size
                return 0.0

            # Not enough tokens: spend what is there and reserve the time
            # needed to earn the remainder.
            wait_time = (size - self.tokens) / self.rate_limit_bytes_s
            self.tokens = 0.0
            self.last_update = now + wait_time
            return wait_time
session.""" + self.session.close() + + +class FileValidator: + """Handles file validation and integrity checks.""" + + @staticmethod + def calculate_checksum(file_path: Path, algorithm: str = 'md5') -> str: + """Calculate file checksum.""" + hash_func = hashlib.new(algorithm) + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(8192), b""): + hash_func.update(chunk) + return hash_func.hexdigest() + + @staticmethod + def validate_file_size(file_path: Path, expected_size: Optional[int] = None) -> bool: + """Validate file size if expected size is provided.""" + if expected_size is None: + return True + return file_path.stat().st_size == expected_size + + @staticmethod + def is_file_complete(file_path: Path, expected_size: Optional[int] = None) -> bool: + """Check if file appears to be complete.""" + if not file_path.exists(): + return False + if expected_size is None: + return True + return file_path.stat().st_size == expected_size + + +class DownloadTask: + """Represents a single download task.""" + + def __init__(self, url: str, dest_path: Path, expected_size: Optional[int] = None, + checksum: Optional[str] = None, resume: bool = True): + self.url = url + self.dest_path = dest_path + self.expected_size = expected_size + self.checksum = checksum + self.resume = resume + self.downloaded_size = 0 + self.status = 'pending' + self.error: Optional[str] = None + + def is_complete(self) -> bool: + """Check if download is complete.""" + if not self.dest_path.exists(): + return False + if self.expected_size: + return self.dest_path.stat().st_size == self.expected_size + return True + + +class DownloadManager: + """Manages parallel downloads with rate limiting, retries, and validation.""" + + def __init__(self, settings: Settings): + self.settings = settings + self.session = DownloadSession(settings) + self.rate_limiter = RateLimiter(settings.rate_limit_mb_s) + self.executor = ThreadPoolExecutor(max_workers=settings.concurrent_downloads) + self.validator = 
FileValidator() + self.active_downloads: Dict[str, DownloadTask] = {} + self.lock = threading.Lock() + + def download_file(self, url: str, dest_path: Path, expected_size: Optional[int] = None, + checksum: Optional[str] = None, show_progress: bool = True) -> bool: + """Download a single file with all features enabled.""" + task = DownloadTask(url, dest_path, expected_size, checksum, self.settings.resume_partial) + + # Check if file already exists and is valid + if task.is_complete() and self._validate_download(task): + if self.settings.debug: + print(f"File already exists and valid: {dest_path}") + return True + + # Start download + return self._download_single_file(task, show_progress) + + def download_files_parallel(self, tasks: List[DownloadTask], + progress_callback: Optional[Callable] = None) -> List[bool]: + """Download multiple files in parallel with Rich progress display.""" + + console = Console() + + # Create rich progress display + progress = Progress( + TextColumn("[bold blue]{task.fields[filename]}", justify="right"), + BarColumn(bar_width=40), + "[progress.percentage]{task.percentage:>3.1f}%", + "โ€ข", + DownloadColumn(), + "โ€ข", + TransferSpeedColumn(), + "โ€ข", + TimeRemainingColumn(), + console=console, + ) + + with progress: + futures = [] + results = [] + task_progress_map = {} + + for task in tasks: + # Check if already complete + if task.is_complete() and self._validate_download(task): + results.append(True) + continue + + # Get expected size + if task.expected_size is None: + task.expected_size = self._get_content_length(task.url) + + # Add progress task + progress_task_id = progress.add_task( + "download", + filename=task.dest_path.name, + total=task.expected_size or 100 + ) + task_progress_map[id(task)] = progress_task_id + + # Submit download job + future = self.executor.submit(self._download_with_rich_progress, task, progress, progress_task_id) + futures.append((future, task, progress_task_id)) + + # Wait for completion + for future, task, 
progress_task_id in futures: + try: + result = future.result() + results.append(result) + if progress_callback: + progress_callback(task, result) + except Exception as e: + console.print(f"[red]Download failed for {task.dest_path}: {e}[/red]") + results.append(False) + + return results + + def _download_with_rich_progress(self, task: DownloadTask, progress, progress_task_id: int) -> bool: + """Download a single file with Rich progress bar updates.""" + try: + # Check for resume + resume_pos = 0 + if task.resume and task.dest_path.exists(): + resume_pos = task.dest_path.stat().st_size + if task.expected_size and resume_pos >= task.expected_size: + return self._validate_download(task) + + # Prepare headers for resume + headers = {} + if task.resume and resume_pos > 0: + headers['Range'] = f'bytes={resume_pos}-' + + response = self.session.session.get(task.url, headers=headers, stream=True) + response.raise_for_status() + + # Update progress bar with actual content length + content_length = response.headers.get('Content-Length') + if content_length: + total_size = int(content_length) + resume_pos + if task.expected_size != total_size: + task.expected_size = total_size + progress.update(progress_task_id, total=total_size) + + mode = 'ab' if resume_pos > 0 else 'wb' + downloaded = resume_pos + + with open(task.dest_path, mode) as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + # Rate limiting + sleep_time = self.rate_limiter.acquire(len(chunk)) + if sleep_time > 0: + time.sleep(sleep_time) + + f.write(chunk) + downloaded += len(chunk) + + # Update Rich progress bar + progress.update(progress_task_id, advance=len(chunk)) + + # Download completed successfully + task.status = 'completed' + return self._validate_download(task) + + except Exception as e: + task.status = 'failed' + task.error = str(e) + + # Clean up partial file if not resuming + if not task.resume and task.dest_path.exists(): + task.dest_path.unlink() + + return False + + def 
_download_single_file(self, task: DownloadTask, show_progress: bool = True) -> bool: + """Download a single file with resume support.""" + try: + # Get file size first + if task.expected_size is None: + task.expected_size = self._get_content_length(task.url) + + # Check for resume + resume_pos = 0 + if task.resume and task.dest_path.exists(): + resume_pos = task.dest_path.stat().st_size + if task.expected_size and resume_pos >= task.expected_size: + return self._validate_download(task) + + # Prepare headers for resume + headers = {} + if task.resume and resume_pos > 0: + headers['Range'] = f'bytes={resume_pos}-' + + # Make request + response = self.session.get(task.url, headers=headers, stream=True) + response.raise_for_status() + + # Handle redirect + if response.status_code == 302: + redirect_url = response.headers.get('Location') + if redirect_url: + response = self.session.get(redirect_url, headers=headers, stream=True) + response.raise_for_status() + + # Get actual content length + content_length = response.headers.get('Content-Length') + if content_length: + total_size = int(content_length) + resume_pos + if task.expected_size is None: + task.expected_size = total_size + + mode = 'ab' if resume_pos > 0 else 'wb' + downloaded = resume_pos + + # Progress bar + if show_progress and task.expected_size: + console = Console() + console.print(f"[blue]Downloading {task.dest_path.name}...[/blue]") + + with open(task.dest_path, mode) as f: + start_time = time.time() + for chunk in response.iter_content(chunk_size=8192): + if chunk: + # Rate limiting + sleep_time = self.rate_limiter.acquire(len(chunk)) + if sleep_time > 0: + time.sleep(sleep_time) + + f.write(chunk) + downloaded += len(chunk) + + # Only show speed updates if not in parallel mode (to avoid spam) + if downloaded % (1024 * 1024) == 0 and show_progress: # Update every 1MB + elapsed = time.time() - start_time + if elapsed > 0: + speed = downloaded / elapsed + # Only print speed updates when show_progress is 
True (not in parallel mode) + pass # Remove speed updates in parallel mode + + # Download completed successfully + + # Validate download + task.status = 'completed' + return self._validate_download(task) + + except Exception as e: + task.status = 'failed' + task.error = str(e) + print(f"Download failed for {task.dest_path}: {e}") + + # Clean up partial file if not resuming + if not task.resume and task.dest_path.exists(): + task.dest_path.unlink() + + return False + + def _get_content_length(self, url: str) -> Optional[int]: + """Get content length from HEAD request.""" + try: + response = self.session.session.head(url, timeout=30) + response.raise_for_status() + content_length = response.headers.get('Content-Length') + return int(content_length) if content_length else None + except: + return None + + def _validate_download(self, task: DownloadTask) -> bool: + """Validate downloaded file with comprehensive checks.""" + if not task.dest_path.exists(): + print(f"โŒ File missing: {task.dest_path.name}") + return False + + try: + file_size = task.dest_path.stat().st_size + + # Check if file is empty or too small + if file_size == 0: + print(f"โŒ Empty file detected: {task.dest_path.name}") + task.dest_path.unlink() # Remove empty file + return False + + # For video/audio files, check if they're complete and valid + if task.dest_path.suffix.lower() in ['.mp4', '.mp3', '.wav', '.m4a']: + if not self._validate_media_file(task.dest_path, file_size): + return False + + # Check expected size if available + if task.expected_size and task.expected_size > 0: + size_ratio = file_size / task.expected_size + + # File should be at least 90% of expected size + if size_ratio < 0.9: + print(f"โŒ Incomplete download: {task.dest_path.name} ({file_size:,} bytes, expected {task.expected_size:,})") + return False + + # File shouldn't be more than 110% of expected size (accounting for small variations) + if size_ratio > 1.1: + print(f"โš ๏ธ File larger than expected: 
{task.dest_path.name} ({file_size:,} bytes, expected {task.expected_size:,})") + # Don't fail for this case, might be normal + + # Additional validation for specific file types + if not self._validate_file_integrity(task.dest_path): + return False + + print(f"โœ… Validated: {task.dest_path.name} ({file_size:,} bytes)") + return True + + except Exception as e: + print(f"โŒ Validation error for {task.dest_path.name}: {e}") + return False + + def _validate_media_file(self, file_path: Path, file_size: int) -> bool: + """Validate media files (MP4, MP3, etc.) for corruption.""" + try: + # Check for minimum file size (media files should be at least a few KB) + if file_size < 1024: # Less than 1KB is suspicious for media + print(f"โŒ Media file too small: {file_path.name} ({file_size} bytes)") + file_path.unlink() # Remove corrupted file + return False + + # Read first and last few bytes to check file structure + with open(file_path, 'rb') as f: + # Check beginning of file for media headers + header = f.read(16) + + # MP4 files should start with specific signatures + if file_path.suffix.lower() == '.mp4': + # Check for common MP4 signatures + if not (b'ftyp' in header or b'mdat' in header[:8]): + print(f"โŒ Invalid MP4 header: {file_path.name}") + file_path.unlink() + return False + + # Check if we can read the end of file (indicates complete download) + try: + f.seek(-min(1024, file_size), 2) # Go to last 1KB or file size + f.read(1024) + except: + print(f"โŒ Cannot read end of file: {file_path.name}") + file_path.unlink() + return False + + return True + + except Exception as e: + print(f"โŒ Media validation failed for {file_path.name}: {e}") + if file_path.exists(): + file_path.unlink() # Remove corrupted file + return False + + def _validate_file_integrity(self, file_path: Path) -> bool: + """Basic file integrity checks.""" + try: + # Try to read the file completely + with open(file_path, 'rb') as f: + chunk_size = 8192 + while chunk := f.read(chunk_size): + pass 
# Just reading to ensure file is accessible + return True + + except Exception as e: + print(f"โŒ File integrity check failed for {file_path.name}: {e}") + if file_path.exists(): + file_path.unlink() # Remove corrupted file + return False + + def close(self): + """Clean up resources.""" + self.session.close() + self.executor.shutdown(wait=True) \ No newline at end of file diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 0ab5711..5a1175f 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -11,6 +11,8 @@ from .config import Settings, load_env from .file_utils import filter_filename, unicode_decode +from .download_manager import DownloadManager, DownloadTask +from .progress_manager import print_banner, print_download_start_banner, print_completion_summary, ContentProcessor from tqdm import tqdm # Globals to mirror PHP behavior @@ -18,17 +20,27 @@ COURSE_CONTENTS: List[Dict[str, Any]] = [] SETTINGS: Optional[Settings] = None BASE_HOST: Optional[str] = None +DOWNLOAD_MANAGER: Optional[DownloadManager] = None +DOWNLOAD_TASKS: List[Dict[str, Any]] = [] # Collect all download tasks for parallel execution +CONTENT_PROCESSOR: Optional[ContentProcessor] = None USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' def init_settings(): - global SETTINGS + global SETTINGS, DOWNLOAD_MANAGER, CONTENT_PROCESSOR if SETTINGS is None: SETTINGS = Settings.from_env() + DOWNLOAD_MANAGER = DownloadManager(SETTINGS) + CONTENT_PROCESSOR = ContentProcessor() def http_get(url: str, headers: Optional[Dict[str, str]] = None, timeout: int = 60) -> str: + import time + import urllib.request + import urllib.error + import gzip + init_settings() if SETTINGS is None: raise RuntimeError("Settings not initialized") @@ -43,13 +55,28 @@ def http_get(url: str, headers: Optional[Dict[str, str]] = None, timeout: int = } if headers: 
request_headers.update(headers) - req = urllib.request.Request(url, headers=request_headers) - with urllib.request.urlopen(req, timeout=timeout) as resp: - data = resp.read() - encoding = resp.headers.get('Content-Encoding', '') - if 'gzip' in encoding: - data = gzip.decompress(data) - return data.decode('utf-8', errors='replace') + + # Retry logic for network reliability + for attempt in range(3): + try: + req = urllib.request.Request(url, headers=request_headers) + with urllib.request.urlopen(req, timeout=15) as resp: + data = resp.read() + encoding = resp.headers.get('Content-Encoding', '') + if 'gzip' in encoding: + data = gzip.decompress(data) + return data.decode('utf-8', errors='replace') + + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: + if attempt < 2: # Not last attempt + print(f" โš ๏ธ Network timeout, retrying... (attempt {attempt + 1}/3)") + time.sleep(2) + continue + else: + raise e + + # Should never reach here, but just in case + raise RuntimeError("All retry attempts failed") def download_file_redirect(url: str, file_name: Optional[str] = None): @@ -85,100 +112,613 @@ def download_file_redirect(url: str, file_name: Optional[str] = None): download_file_chunked(final_url, fname) -def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): - if Path(dst_name).exists(): - return - init_settings() - if SETTINGS is None: - raise RuntimeError("Settings not initialized") - request_headers = { - 'Accept-Encoding': 'identity', # streaming - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'cross-site', - 'x-requested-with': 'XMLHttpRequest', - 'x-thinkific-client-date': SETTINGS.client_date, - 'cookie': SETTINGS.cookie_data, - 'User-Agent': USER_AGENT, - } - req = urllib.request.Request(src_url, headers=request_headers) +def add_download_task(url: str, dest_path: Path, content_type: str = "file"): + """Add a download task to the global download queue.""" + global DOWNLOAD_TASKS + if DOWNLOAD_TASKS is None: + 
DOWNLOAD_TASKS = [] + + # Check if file exists and validate it + should_download = True + if dest_path.exists(): + file_size = dest_path.stat().st_size + + # Always re-download empty or suspiciously small files + if file_size == 0: + print(f"๐Ÿ”„ Re-downloading empty file: {dest_path.name}") + dest_path.unlink() + should_download = True + elif content_type in ['video', 'audio'] and file_size < 1024: + print(f"๐Ÿ”„ Re-downloading corrupt media file: {dest_path.name}") + dest_path.unlink() + should_download = True + elif _validate_existing_file(dest_path, content_type): + print(f"โœ… File already complete: {dest_path.name}") + should_download = False + else: + print(f"๐Ÿ”„ Re-downloading invalid file: {dest_path.name}") + dest_path.unlink() + should_download = True + if should_download: + DOWNLOAD_TASKS.append({ + 'url': url, + 'dest_path': dest_path, + 'content_type': content_type + }) + + +def _validate_existing_file(file_path: Path, content_type: str) -> bool: + """Validate an existing file to determine if re-download is needed.""" try: - with urllib.request.urlopen(req) as resp: - # Get file size for progress bar - content_length = resp.headers.get('Content-Length') - total_size = int(content_length) if content_length else None - - chunk_bytes = chunk_mb * 1024 * 1024 - - # Create progress bar - with tqdm( - total=total_size, - unit='B', - unit_scale=True, - unit_divisor=1024, - desc=f"Downloading {Path(dst_name).name}", - bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' - ) as pbar: - - with open(dst_name, 'wb') as out: - start_time = time.time() - downloaded = 0 - - while True: - chunk = resp.read(chunk_bytes) - if not chunk: - break - out.write(chunk) - downloaded += len(chunk) - - # Update progress bar - pbar.update(len(chunk)) - - # Calculate and display speed every few chunks - if downloaded % (chunk_bytes * 5) == 0: # Update speed every 5MB - elapsed = time.time() - start_time - if elapsed > 0: - speed = downloaded / 
elapsed - pbar.set_postfix({'speed': f'{speed/1024/1024:.2f} MB/s'}) - - # Final speed calculation - elapsed = time.time() - start_time - if elapsed > 0: - speed = downloaded / elapsed - print(f"Download completed: {downloaded/1024/1024:.2f} MB in {elapsed:.2f}s (avg: {speed/1024/1024:.2f} MB/s)") + file_size = file_path.stat().st_size + + # Empty files are always invalid + if file_size == 0: + return False + + # Media files need special validation + if content_type in ['video', 'audio'] and file_path.suffix.lower() in ['.mp4', '.mp3', '.wav', '.m4a']: + return _validate_media_file_basic(file_path, file_size) + + # For other files, just check if they're readable + try: + with open(file_path, 'rb') as f: + f.read(1024) # Try to read first 1KB + return True + except: + return False + + except Exception: + return False + + +def _validate_media_file_basic(file_path: Path, file_size: int) -> bool: + """Basic validation for media files.""" + try: + # Too small files are invalid + if file_size < 1024: + return False + + # Check file headers + with open(file_path, 'rb') as f: + header = f.read(16) + + # MP4 validation + if file_path.suffix.lower() == '.mp4': + if not (b'ftyp' in header or b'mdat' in header[:8]): + return False + + # Check if we can read the end (complete file) + try: + f.seek(-min(512, file_size), 2) + f.read(512) + except: + return False + + return True + + except Exception: + return False + + +def execute_parallel_downloads() -> int: + """Execute all queued downloads in parallel and return success count.""" + global DOWNLOAD_TASKS, DOWNLOAD_MANAGER - except Exception as e: - print(f"Download failed for {dst_name}: {e}") - # Clean up partial file - if Path(dst_name).exists(): - Path(dst_name).unlink() + if not DOWNLOAD_TASKS or not DOWNLOAD_MANAGER: + return 0 + + from .download_manager import DownloadTask + + # Convert to DownloadTask objects + tasks = [] + for task_data in DOWNLOAD_TASKS: + task = DownloadTask( + url=task_data['url'], + 
dest_path=task_data['dest_path'] + ) + tasks.append(task) + + # Execute downloads in parallel + results = DOWNLOAD_MANAGER.download_files_parallel(tasks) + + # Count successful downloads + success_count = sum(1 for result in results if result) + return success_count + + +def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): + """Queue file for parallel download instead of downloading immediately.""" + global DOWNLOAD_TASKS + dst_path = Path(dst_name) + + # Skip if file already exists + if dst_path.exists(): + return + + # Add to download queue instead of downloading immediately + add_download_task(src_url, dst_path, "file") + def init_course(data: Dict[str, Any]): - global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST + """Initialize course structure and collect ALL download tasks first.""" + global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS + + # Initialize download tasks list + DOWNLOAD_TASKS = [] + course_name = filter_filename(data['course']['name']) prev_dir = Path.cwd() ROOT_PROJECT_DIR = prev_dir - course_dir = Path(course_name) + + # Use output_dir from settings, create it if it doesn't exist + output_dir = Path(SETTINGS.output_dir if SETTINGS else './downloads') + output_dir.mkdir(exist_ok=True, parents=True) + + # Create course directory inside the output directory + course_dir = output_dir / course_name course_dir.mkdir(exist_ok=True) os.chdir(course_dir) COURSE_CONTENTS = data['contents'] + + # Check for resume capability + cache_file = Path('.thinkific_progress.json') + analyzed_chapters = set() + saved_tasks = [] + + if cache_file.exists(): + try: + import json + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) + saved_tasks = cache_data.get('download_tasks', []) + print(f"๐Ÿ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") + except: + analyzed_chapters = set() + 
saved_tasks = [] + # Derive base host from landing_page_url if available landing = data['course'].get('landing_page_url') if landing: BASE_HOST = urlparse(landing).hostname - create_chap_folders(data) + + # Phase 1: Create all folders and collect ALL download links + print("\n๐Ÿ” Phase 1: Analyzing course content and collecting download links...") + + # Restore saved download tasks + if saved_tasks: + print(f"๐Ÿ“ฅ Restoring {len(saved_tasks)} previously collected download tasks...") + for task_data in saved_tasks: + add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + + collect_all_download_tasks(data, analyzed_chapters, cache_file) + + # Phase 2: Execute ALL downloads together + if DOWNLOAD_TASKS: + from .progress_manager import print_download_start_banner + + print(f"\n๐Ÿš€ Phase 2: Starting parallel download of {len(DOWNLOAD_TASKS)} files...") + + # Initialize download manager + init_settings() + parallel_workers = SETTINGS.concurrent_downloads if SETTINGS else 3 + print_download_start_banner(len(DOWNLOAD_TASKS), parallel_workers) + + if DOWNLOAD_MANAGER: + import time + start_time = time.time() + success_count = execute_parallel_downloads() + total_time = time.time() - start_time + + if success_count is not None: + from .progress_manager import print_completion_summary + print_completion_summary(success_count, len(DOWNLOAD_TASKS), total_time) + else: + print(f"[INFO] Download process completed in {total_time:.2f}s") + else: + print("[ERROR] Download manager not initialized") + else: + print("[INFO] No files found for download") + os.chdir(prev_dir) -def create_chap_folders(data: Dict[str, Any]): +def collect_all_download_tasks(data: Dict[str, Any], analyzed_chapters = None, cache_file = None): + """Collect ALL download tasks for the entire course without downloading anything.""" + global DOWNLOAD_TASKS + + if analyzed_chapters is None: + analyzed_chapters = set() + + import json + for i, chapter in 
enumerate(data.get('chapters', []), start=1): + chapter_id = f"chapter_{i}" + + # Skip if already analyzed (for resume) + if chapter_id in analyzed_chapters: + print(f"โญ๏ธ Skipping Chapter {i}: {chapter['name']} (already analyzed)") + continue + chap_folder_name = f"{i}. {filter_filename(chapter['name'])}" - Path(chap_folder_name).mkdir(exist_ok=True) - prev_dir = Path.cwd() - os.chdir(chap_folder_name) - chapterwise_download(chapter['content_ids']) - os.chdir(prev_dir) + chapter_path = Path(chap_folder_name) + chapter_path.mkdir(exist_ok=True) + + print(f"๐Ÿ“ Analyzing Chapter {i}: {chapter['name']}") + + # Collect download tasks for this chapter + collect_chapter_tasks(chapter['content_ids'], chapter_path) + + # Mark as analyzed and save progress + analyzed_chapters.add(chapter_id) + if cache_file: + try: + # Save current download tasks for resume + task_data = [] + for task in DOWNLOAD_TASKS: + task_data.append({ + 'url': task['url'], + 'dest_path': str(task['dest_path']), + 'content_type': task.get('content_type', 'video') + }) + + with open(cache_file, 'w', encoding='utf-8') as f: + json.dump({ + 'analyzed_chapters': list(analyzed_chapters), + 'download_tasks': task_data + }, f, indent=2) + except Exception as e: + print(f" โš ๏ธ Could not save progress: {e}") + pass # Continue even if cache save fails + + +def collect_chapter_tasks(content_ids: Iterable[Any], chapter_path: Path): + """Collect download tasks for a specific chapter.""" + from .wistia_downloader import video_downloader_wistia, video_downloader_videoproxy + global COURSE_CONTENTS, SETTINGS, DOWNLOAD_TASKS + + index = 1 + for content_id in content_ids: + match = next((c for c in COURSE_CONTENTS if c['id'] == content_id), None) + if not match: + print(f" โš ๏ธ No content found for id {content_id}") + index += 1 + continue + + ctype = match.get('contentable_type') or match.get('default_lesson_type_label') + print(f" ๐Ÿ” Found {ctype}: {match.get('name')}") + + # HTML Item (Notes) - Collect 
download tasks + if ctype == 'HtmlItem': + fname = filter_filename(f"{match['slug']}.html") + dc = chapter_path / filter_filename(f"{index}. {match['name']} Text") + dc.mkdir(exist_ok=True) + + if not (dc / fname).exists(): + j = api_get(f"/api/course_player/v2/html_items/{match['contentable']}") + if j: + html_text = j.get('html_item', {}).get('html_text', '') + decoded = unicode_decode(html_text) + + # Collect MP3 audio files + mp3_matches = MP3_PATTERN.findall(decoded) + if mp3_matches: + for audio_url in set(mp3_matches): + audio_name = filter_filename(Path(urlparse(audio_url).path).name) + add_download_task(audio_url, dc / audio_name, "audio") + + # Save HTML content to file + fname = fname.replace(" ", "-") + (dc / fname).write_text(decoded, encoding='utf-8', errors='replace') + + # Collect video download tasks + videoproxy_matches = VIDEOPROXY_PATTERN.findall(decoded) + if videoproxy_matches: + for video_url in set(videoproxy_matches): + collect_video_task_videoproxy(video_url, filter_filename(match['name']), dc) + + wistia_matches = WISTIA_PATTERN.findall(decoded) + if wistia_matches: + for wistia_id in set(wistia_matches): + collect_video_task_wistia(wistia_id, filter_filename(match['name']), dc) + + index += 1 + continue + + # Multimedia (iframe) - Collect download tasks + if match.get('default_lesson_type_label') == 'Multimedia': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']} Multimedia") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/iframes/{match['contentable']}") + file_contents = '' + if j: + src_url = unicode_decode(j.get('iframe', {}).get('source_url') or '') + if re.search(r"(\.md|\.html|/)$", src_url): + try: + file_contents = http_get(src_url) + except Exception: + file_contents = src_url + else: + file_contents = src_url + + # Collect attached files + if j.get('download_files'): + for download_file in j['download_files']: + download_file_name = filter_filename(download_file.get('label') or 'file') + download_file_url = download_file.get('download_url') + if download_file_url: + add_download_task(download_file_url, dc / download_file_name, "file") + + # Save HTML file + fname = f"{match['name']}.html" + fname = re.sub(r"[^A-Za-z0-9\_\-\. \?]", '', fname) + fname = filter_filename(fname) + (dc / fname).write_text(file_contents, encoding='utf-8', errors='replace') + + index += 1 + continue + + # Lesson (videos + html + attachments) - Collect download tasks + if ctype == 'Lesson': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']} Lesson") + dc.mkdir(exist_ok=True) + vname = filter_filename(match['name']) + + j = api_get(f"/api/course_player/v2/lessons/{match['contentable']}") + if j: + # Collect video download tasks + videos = j.get('videos') or [] + if videos: + for video in videos: + storage = video.get('storage_location') + identifier = video.get('identifier') + if storage == 'wistia' and identifier: + collect_video_task_wistia(identifier, vname, dc) + elif storage == 'videoproxy' and identifier: + collect_video_task_videoproxy(f"https://platform.thinkific.com/videoproxy/v1/play/{identifier}", vname, dc) + else: + direct = video.get('url') + if direct: + add_download_task(direct, dc / f"{vname}.mp4", "video") + + # Save lesson HTML content + lesson_info = j.get('lesson', {}) + html_text = lesson_info.get('html_text') if isinstance(lesson_info, dict) else None + if html_text and html_text.strip(): + html_filename = f"{vname}.html" + (dc / html_filename).write_text(html_text, encoding='utf-8', errors='replace') + + # Collect attached files + for dlf in j.get('download_files', []) or []: + download_file_name = filter_filename(dlf.get('label') or 'file') + download_file_url = dlf.get('download_url') + if download_file_url: + add_download_task(download_file_url, dc / download_file_name, "file") + + index += 1 + continue + + # PDF - Collect download tasks + if ctype == 'Pdf': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/pdfs/{match['contentable']}") + if j: + pdf = j.get('pdf', {}) + pdf_url = pdf.get('url') + if pdf_url: + fname = filter_filename(Path(urlparse(pdf_url).path).name) + add_download_task(pdf_url, dc / fname, "pdf") + + index += 1 + continue + + # Download (shared files) - Collect download tasks + if ctype == 'Download': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/downloads/{match['contentable']}") + if j: + for dlf in j.get('download_files', []) or []: + label = filter_filename(dlf.get('label') or 'file') + url = dlf.get('download_url') + if url: + add_download_task(url, dc / label, "file") + + index += 1 + continue + + # Audio - Collect download tasks + if ctype == 'Audio': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/audio/{match['contentable']}") + if j: + audio = j.get('audio', {}) + audio_url = audio.get('url') + if audio_url: + fname = filter_filename(Path(urlparse(audio_url).path).name) + add_download_task(audio_url, dc / fname, "audio") + + index += 1 + continue + + # Presentation - Collect download tasks + if ctype == 'Presentation': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/presentations/{match['contentable']}") + if j: + pres = j.get('presentation', {}) + pdf_url = pres.get('source_file_url') + pdf_name = filter_filename(pres.get('source_file_name') or 'slides.pdf') + if pdf_url: + add_download_task(pdf_url, dc / pdf_name, "presentation") + + # Handle presentation merging - collect slide assets + merge_flag = SETTINGS.ffmpeg_presentation_merge if SETTINGS else False + if merge_flag: + from shutil import which + if which('ffmpeg'): + items = j.get('presentation_items') or [] + for it in items: + pos = it.get('position') + img_url = it.get('image_file_url') + aud_url = it.get('audio_file_url') + if img_url: + img_url = 'https:' + img_url if img_url.startswith('//') else img_url + img_name = filter_filename(f"{pos}{it.get('image_file_name','slide.png')}") + add_download_task(img_url, dc / img_name, "image") + if aud_url: + aud_url = 'https:' + aud_url if aud_url.startswith('//') else aud_url + aud_name = 
filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") + add_download_task(aud_url, dc / aud_name, "audio") + + index += 1 + continue + + # Quiz - Handle separately (complex logic) + if ctype == 'Quiz': + dc = chapter_path / filter_filename(f"{index}. {match['name']} Quiz") + dc.mkdir(exist_ok=True) + + fname = filter_filename(f"{match['name']} Answers.html") + qname = filter_filename(f"{match['name']} Questions.html") + + result = api_get(f"/api/course_player/v2/quizzes/{match['contentable']}") + if result: + file_contents_with_answers = "

Answers of this Quiz are marked in RED

" + file_contents_with_questions = "" + + for qs in result.get("questions", []): + choice = 'A' + position = qs.get("position", 0) + 1 + prompt = unicode_decode(qs.get("prompt", "")) + explanation = unicode_decode(qs.get("text_explanation", "")) + + file_contents_with_answers += f"{position}) {prompt} Explanation: {explanation}

" + + # Collect embedded video tasks + wistia_matches = WISTIA_PATTERN.findall(prompt) + if wistia_matches: + for wistia_match in set(wistia_matches): + collect_video_task_wistia(wistia_match, f"QA Video {position}", dc) + + file_contents_with_questions += f"{position}) {prompt}

" + + for ch in result.get("choices", []): + if ch.get("question_id") == qs.get("id"): + try: + import base64 + ans = base64.b64decode(ch.get("credited", "")).decode('utf-8', 'ignore') + ans = re.sub(r'\d', '', ans) + except Exception: + ans = "" + + choice_text = unicode_decode(ch.get("text", "")) + if ans == "true": + file_contents_with_questions += f"{choice}) {choice_text}
" + file_contents_with_answers += f"{choice}) {choice_text}
" + else: + file_contents_with_questions += f"{choice}) {choice_text}
" + file_contents_with_answers += f"{choice}) {choice_text}
" + + choice = chr(ord(choice) + 1) + + file_contents_with_questions += "
" + file_contents_with_answers += "
" + + (dc / qname).write_text(file_contents_with_questions, encoding='utf-8', errors='replace') + (dc / fname).write_text(file_contents_with_answers, encoding='utf-8', errors='replace') + + index += 1 + continue + + # Assignment/Survey placeholders + if ctype in ['Assignment', 'Survey']: + print(f" โš ๏ธ {ctype} content type not yet implemented: {match['name']}") + index += 1 + continue + + index += 1 + + +def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path): + """Collect Wistia video download task.""" + try: + from urllib.request import urlopen + from urllib.error import URLError, HTTPError + import json + import time + + # Get video info from Wistia API with retry logic + api_url = f"https://fast.wistia.com/embed/medias/{wistia_id}.json" + + data = None + for attempt in range(3): # 3 retry attempts + try: + with urlopen(api_url, timeout=15) as response: + data = json.loads(response.read().decode()) + break # Success, exit retry loop + + except (URLError, HTTPError, TimeoutError) as e: + if attempt < 2: # Not last attempt + print(f" โš ๏ธ Network timeout, retrying... 
(attempt {attempt + 1}/3)") + time.sleep(2) # Wait 2 seconds before retry + continue + else: + print(f" โŒ Failed to get video info after 3 attempts: {file_name}") + return + + if not data: + return + + assets = data.get('media', {}).get('assets', []) + if not assets: + return + + # Find best quality video + video_assets = [a for a in assets if a.get('type') == 'original'] + if not video_assets: + video_assets = [a for a in assets if a.get('type') in ['mp4_720', 'mp4_540', 'mp4_360']] + + if video_assets: + selected = video_assets[0] + video_url = selected.get('url') + if video_url: + ext = '.mp4' # Default extension + resolved_name = filter_filename(file_name) + ext + print(f" ๐Ÿ“น Found video: {resolved_name}") + add_download_task(video_url, dest_dir / resolved_name, "video") + except Exception as e: + print(f" โŒ Failed to collect Wistia video {wistia_id}: {e}") + + +def collect_video_task_videoproxy(video_url: str, file_name: str, dest_dir: Path): + """Collect videoproxy download task.""" + try: + from .wistia_downloader import VIDEO_PROXY_JSONP_ID_PATTERN + + video_html_frame = http_get(video_url) + match = VIDEO_PROXY_JSONP_ID_PATTERN.search(video_html_frame) + if match: + wistia_id = match.group(1) + collect_video_task_wistia(wistia_id, file_name, dest_dir) + except Exception as e: + print(f" โŒ Failed to collect videoproxy video: {e}") + + +def create_chap_folders(data: Dict[str, Any]): + """Legacy function - now handled by collect_all_download_tasks.""" + pass # Patterns reused @@ -195,10 +735,12 @@ def api_get(endpoint: str) -> Optional[Dict[str, Any]]: print('Base host unknown; cannot call API:', endpoint) return None url = f"https://{BASE_HOST}{endpoint}" - print(f"[API] Fetching: {url}") + if SETTINGS and SETTINGS.debug: + print(f"[API] Fetching: {url}") try: raw = http_get(url) - print(f"[API] Response (first 200 chars): {raw[:200]}") + if SETTINGS and SETTINGS.debug: + print(f"[API] Response (first 200 chars): {raw[:200]}") return json.loads(raw) 
except Exception as e: print(f"API GET failed {endpoint}: {e}") @@ -206,8 +748,17 @@ def api_get(endpoint: str) -> Optional[Dict[str, Any]]: def chapterwise_download(content_ids: Iterable[Any]): + """Process all content and queue downloads, then execute in parallel batches.""" from .wistia_downloader import video_downloader_wistia, video_downloader_videoproxy # local import - global COURSE_CONTENTS, SETTINGS, ROOT_PROJECT_DIR + from .progress_manager import print_completion_summary + + global COURSE_CONTENTS, SETTINGS, ROOT_PROJECT_DIR, DOWNLOAD_TASKS + + # Initialize and clear any existing download tasks + init_settings() + DOWNLOAD_TASKS = [] + + # Phase 1: Process all content and queue downloads (no actual downloading) index = 1 for content_id in content_ids: match = next((c for c in COURSE_CONTENTS if c['id'] == content_id), None) @@ -216,103 +767,101 @@ def chapterwise_download(content_ids: Iterable[Any]): index += 1 continue ctype = match.get('contentable_type') or match.get('default_lesson_type_label') - print(f"[INFO] Processing content id {content_id} type {ctype} name {match.get('name')}") + print(f"[QUEUE] Processing content id {content_id} type {ctype} name {match.get('name')}") - # HTML Item (Notes) + # HTML Item (Notes) - Queue downloads if ctype == 'HtmlItem': fname = filter_filename(f"{match['slug']}.html") - if Path(fname).exists(): - print("File already exists, skipping") - index += 1 - continue dc = filter_filename(f"{index}. 
{match['name']} Text") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']}") - j = api_get(f"/api/course_player/v2/html_items/{match['contentable']}") - if j: - html_text = j.get('html_item', {}).get('html_text', '') - decoded = unicode_decode(html_text) - - # Extract videoproxy links - videoproxy_matches = VIDEOPROXY_PATTERN.findall(decoded) - if videoproxy_matches: - print("Found Videoproxy in HTML Item") - for video_url in set(videoproxy_matches): - video_downloader_videoproxy(video_url, filter_filename(match['name']), SETTINGS.video_download_quality if SETTINGS else '720p') - - # Extract MP3 audio files - mp3_matches = MP3_PATTERN.findall(decoded) - if mp3_matches: - print("Found Audios in HTML Item") - for audio_url in set(mp3_matches): - audio_name = filter_filename(Path(urlparse(audio_url).path).name) - download_file_chunked(audio_url, audio_name) - - # Extract Wistia videos - wistia_matches = WISTIA_PATTERN.findall(decoded) - if wistia_matches: - print("Found Wistia Videos in HTML Item") - for wistia_id in set(wistia_matches): - video_downloader_wistia(wistia_id, filter_filename(match['name']), SETTINGS.video_download_quality if SETTINGS else '720p') - - # Save HTML content to file - fname = fname.replace(" ", "-") # PHP replaces spaces with dashes - Path(fname).write_text(decoded, encoding='utf-8', errors='replace') + + if not Path(fname).exists(): + j = api_get(f"/api/course_player/v2/html_items/{match['contentable']}") + if j: + html_text = j.get('html_item', {}).get('html_text', '') + decoded = unicode_decode(html_text) + + # Queue MP3 audio files with absolute paths + mp3_matches = MP3_PATTERN.findall(decoded) + if mp3_matches: + current_dir = Path.cwd() + for audio_url in set(mp3_matches): + audio_name = filter_filename(Path(urlparse(audio_url).path).name) + add_download_task(audio_url, current_dir / audio_name, "audio") + + # Save HTML content to file + fname = fname.replace(" ", "-") + 
Path(fname).write_text(decoded, encoding='utf-8', errors='replace') + + # Handle video downloads - queue them instead of downloading immediately + videoproxy_matches = VIDEOPROXY_PATTERN.findall(decoded) + if videoproxy_matches: + for video_url in set(videoproxy_matches): + # Extract video info and queue for download + from .wistia_downloader import video_downloader_videoproxy + video_downloader_videoproxy(video_url, filter_filename(match['name']), SETTINGS.video_download_quality if SETTINGS else '720p') + + wistia_matches = WISTIA_PATTERN.findall(decoded) + if wistia_matches: + for wistia_id in set(wistia_matches): + # Extract video info and queue for download + from .wistia_downloader import video_downloader_wistia + video_downloader_wistia(wistia_id, filter_filename(match['name']), SETTINGS.video_download_quality if SETTINGS else '720p') + os.chdir(prev) index += 1 continue - # Multimedia (iframe) + # Multimedia (iframe) - Queue downloads if match.get('default_lesson_type_label') == 'Multimedia': dc = filter_filename(f"{index}. 
{match['name']} Multimedia") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']}") + j = api_get(f"/api/course_player/v2/iframes/{match['contentable']}") file_contents = '' if j: src_url = unicode_decode(j.get('iframe', {}).get('source_url') or '') - # PHP logic: if URL contains .md, .html, or ends with /, try to fetch content if re.search(r"(\.md|\.html|/)$", src_url): try: file_contents = http_get(src_url) except Exception: - print("Not a valid documents, continuing") file_contents = src_url else: file_contents = src_url - # Download attached files + # Queue attached files with absolute paths if j.get('download_files'): + current_dir = Path.cwd() for download_file in j['download_files']: download_file_name = filter_filename(download_file.get('label') or 'file') download_file_url = download_file.get('download_url') if download_file_url: - download_file_chunked(download_file_url, download_file_name) + add_download_task(download_file_url, current_dir / download_file_name, "file") - # Save to HTML file (PHP logic) + # Save HTML file fname = f"{match['name']}.html" - fname = re.sub(r"[^A-Za-z0-9\_\-\. \?]", '', fname) # PHP filename sanitization + fname = re.sub(r"[^A-Za-z0-9\_\-\. \?]", '', fname) fname = filter_filename(fname) Path(fname).write_text(file_contents, encoding='utf-8', errors='replace') + os.chdir(prev) index += 1 continue - # Lesson (videos + html + attachments) + # Lesson (videos + html + attachments) - Queue downloads if ctype == 'Lesson': dc = filter_filename(f"{index}. 
{match['name']} Lesson") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) vname = filter_filename(match['name']) - print(f"Downloading Video : {vname}") + j = api_get(f"/api/course_player/v2/lessons/{match['contentable']}") if j: + # Handle videos - queue them for parallel download videos = j.get('videos') or [] - if not videos: - print('No Lesson Videos found for', vname) - else: + if videos: for video in videos: storage = video.get('storage_location') identifier = video.get('identifier') @@ -321,166 +870,123 @@ def chapterwise_download(content_ids: Iterable[Any]): elif storage == 'videoproxy' and identifier: video_downloader_videoproxy(f"https://platform.thinkific.com/videoproxy/v1/play/{identifier}", vname, SETTINGS.video_download_quality if SETTINGS else '720p') else: - print(f"Unknown video storage location. Trying Native Method for {vname}") direct = video.get('url') if direct: - download_file_redirect(direct, vname) + current_dir = Path.cwd() + add_download_task(direct, current_dir / f"{vname}.mp4", "video") - # Save lesson HTML content if exists (PHP logic) + # Save lesson HTML content lesson_info = j.get('lesson', {}) html_text = lesson_info.get('html_text') if isinstance(lesson_info, dict) else None - if html_text and html_text.strip(): # PHP checks if not empty - print(f"Saving HTML Text for {vname}") + if html_text and html_text.strip(): html_filename = f"{vname}.html" Path(html_filename).write_text(html_text, encoding='utf-8', errors='replace') - # Download attached files + # Queue attached files with absolute paths for dlf in j.get('download_files', []) or []: download_file_name = filter_filename(dlf.get('label') or 'file') download_file_url = dlf.get('download_url') if download_file_url: - download_file_chunked(download_file_url, download_file_name) + current_dir = Path.cwd() + add_download_task(download_file_url, current_dir / download_file_name, "file") + os.chdir(prev); index += 1; continue - # Pdf + # PDF - Queue downloads if ctype 
== 'Pdf': dc = filter_filename(f"{index}. {match['name']}") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']} (PDF)") + j = api_get(f"/api/course_player/v2/pdfs/{match['contentable']}") if j: pdf = j.get('pdf', {}) pdf_url = pdf.get('url') if pdf_url: + current_dir = Path.cwd() fname = filter_filename(Path(urlparse(pdf_url).path).name) - download_file_chunked(pdf_url, fname) + add_download_task(pdf_url, current_dir / fname, "pdf") + os.chdir(prev); index += 1; continue - # Download (shared files) + # Download (shared files) - Queue downloads if ctype == 'Download': dc = filter_filename(f"{index}. {match['name']}") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']} (Files)") + j = api_get(f"/api/course_player/v2/downloads/{match['contentable']}") if j: + current_dir = Path.cwd() for dlf in j.get('download_files', []) or []: label = filter_filename(dlf.get('label') or 'file') url = dlf.get('download_url') if url: - download_file_chunked(url, label) + add_download_task(url, current_dir / label, "file") + os.chdir(prev); index += 1; continue - # Audio + # Audio - Queue downloads if ctype == 'Audio': dc = filter_filename(f"{index}. {match['name']}") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']} (Audio)") + j = api_get(f"/api/course_player/v2/audio/{match['contentable']}") if j: audio = j.get('audio', {}) audio_url = audio.get('url') if audio_url: + current_dir = Path.cwd() fname = filter_filename(Path(urlparse(audio_url).path).name) - download_file_chunked(audio_url, fname) + add_download_task(audio_url, current_dir / fname, "audio") + os.chdir(prev); index += 1; continue - # Presentation + # Presentation - Queue downloads if ctype == 'Presentation': dc = filter_filename(f"{index}. 
{match['name']}") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) - print(f"Downloading {match['name']} (Presentation)") + j = api_get(f"/api/course_player/v2/presentations/{match['contentable']}") if j: pres = j.get('presentation', {}) pdf_url = pres.get('source_file_url') pdf_name = filter_filename(pres.get('source_file_name') or 'slides.pdf') if pdf_url: - download_file_chunked(pdf_url, pdf_name) - # Optional merging if ffmpeg available and flag is set + current_dir = Path.cwd() + add_download_task(pdf_url, current_dir / pdf_name, "presentation") + + # Handle presentation merging separately (complex ffmpeg logic not parallelized) merge_flag = SETTINGS.ffmpeg_presentation_merge if SETTINGS else False if merge_flag: - # Detect ffmpeg availability from shutil import which - if which('ffmpeg') is None: - print('ffmpeg not found in PATH; skipping merge. Install ffmpeg or disable flag.') - else: + if which('ffmpeg'): items = j.get('presentation_items') or [] - # Download images & audio (with position prefix) first - print('Downloading slide images/audio for merge') + current_dir = Path.cwd() + # Queue slide images and audio files for it in items: pos = it.get('position') img_url = it.get('image_file_url') aud_url = it.get('audio_file_url') if img_url: - download_file_chunked('https:' + img_url if img_url.startswith('//') else img_url, - filter_filename(f"{pos}{it.get('image_file_name','slide.png')}") ) - if aud_url: - download_file_chunked('https:' + aud_url if aud_url.startswith('//') else aud_url, - filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") ) - # Build per-slide videos - print('Merging slides to per-slide videos') - list_entries = [] - for it in items: - pos = it.get('position') - img_name = filter_filename(f"{pos}{it.get('image_file_name','slide.png')}") - aud_name = filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") if it.get('audio_file_url') else None - slide_video = filter_filename(f"{pos}-slide.mp4") - if 
Path(slide_video).exists(): - list_entries.append(slide_video) - continue - # Build ffmpeg command - if aud_name and Path(aud_name).exists(): - cmd = f'ffmpeg -r 1 -loop 1 -y -i "{img_name}" -i "{aud_name}" -c:a copy -r 1 -vcodec libx264 -shortest "{slide_video}" -hide_banner -loglevel error' - else: - cmd = f'ffmpeg -r 1 -loop 1 -t 5 -y -i "{img_name}" -f lavfi -i anullsrc -c:a aac -r 1 -vcodec libx264 -shortest "{slide_video}" -hide_banner -loglevel error' - print(cmd) - os.system(cmd) - if Path(slide_video).exists(): - list_entries.append(slide_video) - if list_entries: - # Write list.txt - with open('list.txt','w', encoding='utf-8') as lf: - for f in list_entries: - lf.write(f"file '{f}'\n") - merged_name = filter_filename(f"{match['contentable']}-{match['name']}-merged.mp4") - if not Path(merged_name).exists(): - concat_cmd = f'ffmpeg -n -f concat -safe 0 -i list.txt -c copy "{merged_name}" -hide_banner' - print(concat_cmd) - os.system(concat_cmd) - # Clean intermediates - for f in list_entries: - try: - Path(f).unlink() - except Exception: - pass - try: - Path('list.txt').unlink() - except Exception: - pass - # Remove slide assets (images/audio) - for it in items: - pos = it.get('position') + img_url = 'https:' + img_url if img_url.startswith('//') else img_url img_name = filter_filename(f"{pos}{it.get('image_file_name','slide.png')}") - if Path(img_name).exists(): - try: Path(img_name).unlink() - except Exception: pass - if it.get('audio_file_url'): - aud_name = filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") - if Path(aud_name).exists(): - try: Path(aud_name).unlink() - except Exception: pass + add_download_task(img_url, current_dir / img_name, "image") + if aud_url: + aud_url = 'https:' + aud_url if aud_url.startswith('//') else aud_url + aud_name = filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") + add_download_task(aud_url, current_dir / aud_name, "audio") + os.chdir(prev); index += 1; continue - # Quiz + # Quiz - Handle 
separately (complex logic) if ctype == 'Quiz': - print(f"Downloading {match['name']}") dc = filter_filename(f"{index}. {match['name']} Quiz") Path(dc).mkdir(exist_ok=True) prev = Path.cwd(); os.chdir(dc) + fname = filter_filename(f"{match['name']} Answers.html") qname = filter_filename(f"{match['name']} Questions.html") @@ -489,16 +995,15 @@ def chapterwise_download(content_ids: Iterable[Any]): file_contents_with_answers = "

Answers of this Quiz are marked in RED

" file_contents_with_questions = "" - # Process questions (PHP logic) for qs in result.get("questions", []): choice = 'A' - position = qs.get("position", 0) + 1 # PHP increments position by 1 + position = qs.get("position", 0) + 1 prompt = unicode_decode(qs.get("prompt", "")) explanation = unicode_decode(qs.get("text_explanation", "")) file_contents_with_answers += f"{position}) {prompt} Explanation: {explanation}

" - # Extract embedded Wistia videos from prompt (PHP logic) + # Handle embedded videos - queue them for parallel download wistia_matches = WISTIA_PATTERN.findall(prompt) if wistia_matches: for wistia_match in set(wistia_matches): @@ -506,13 +1011,12 @@ def chapterwise_download(content_ids: Iterable[Any]): file_contents_with_questions += f"{position}) {prompt}

" - # Process choices for this question for ch in result.get("choices", []): if ch.get("question_id") == qs.get("id"): try: import base64 ans = base64.b64decode(ch.get("credited", "")).decode('utf-8', 'ignore') - ans = re.sub(r'\d', '', ans) # Remove digits + ans = re.sub(r'\d', '', ans) except Exception: ans = "" @@ -524,12 +1028,11 @@ def chapterwise_download(content_ids: Iterable[Any]): file_contents_with_questions += f"{choice}) {choice_text}
" file_contents_with_answers += f"{choice}) {choice_text}
" - choice = chr(ord(choice) + 1) # Increment choice letter + choice = chr(ord(choice) + 1) file_contents_with_questions += "
" file_contents_with_answers += "
" - # Write both files (PHP logic) Path(qname).write_text(file_contents_with_questions, encoding='utf-8', errors='replace') Path(fname).write_text(file_contents_with_answers, encoding='utf-8', errors='replace') @@ -537,19 +1040,36 @@ def chapterwise_download(content_ids: Iterable[Any]): index += 1 continue - # Assignment (placeholder - currently planned) - if ctype == 'Assignment': - print(f"Assignment content type not yet implemented: {match['name']}") - index += 1 - continue - - # Survey (placeholder - currently planned) - if ctype == 'Survey': - print(f"Survey content type not yet implemented: {match['name']}") + # Assignment/Survey placeholders + if ctype in ['Assignment', 'Survey']: + print(f"{ctype} content type not yet implemented: {match['name']}") index += 1 continue index += 1 + + # Phase 2: Execute all queued downloads in parallel + if DOWNLOAD_TASKS: + from .progress_manager import print_download_start_banner + + print(f"\n[PARALLEL] Starting parallel download of {len(DOWNLOAD_TASKS)} files...") + parallel_workers = SETTINGS.concurrent_downloads if SETTINGS else 3 + print_download_start_banner(len(DOWNLOAD_TASKS), parallel_workers) + + if DOWNLOAD_MANAGER: + import time + start_time = time.time() + success_count = execute_parallel_downloads() + total_time = time.time() - start_time + + if success_count is not None: + print_completion_summary(success_count, len(DOWNLOAD_TASKS), total_time) + else: + print(f"[INFO] Download process completed in {total_time:.2f}s") + else: + print("[ERROR] Download manager not initialized") + else: + print("[INFO] No files queued for download") def handler(course_url: str): @@ -566,7 +1086,8 @@ def handler(course_url: str): def main(argv: List[str]): - print("THINKIFIC DOWNLOADER\nPython Port (Core)\nAuthor: Ported by Assistant\n") + print_banner() + # Ensure .env is loaded before checking COURSE_URL/COURSE_LINK try: load_env() @@ -577,38 +1098,44 @@ def main(argv: List[str]): course_url_env_alt = os.getenv('COURSE_LINK') 
effective_course_url_env = course_url_env_primary or course_url_env_alt - if ('--json' in argv and len(argv) > 2) or os.getenv('COURSE_DATA_FILE'): - if '--json' in argv: - json_path = Path(argv[argv.index('--json') + 1]) - if not json_path.exists(): - print(f"File not found: {json_path}") - return - print('Using Custom Metadata File for course data.') - data = json.loads(json_path.read_text(encoding='utf-8')) - else: - course_data_file = os.getenv('COURSE_DATA_FILE') - if not course_data_file: - print('COURSE_DATA_FILE env var not set.') - return - json_path = Path(course_data_file) - if not json_path.exists(): - print(f"File not found: {json_path}") - return - print('Loading Custom Metadata File from env for course data.') - data = json.loads(json_path.read_text(encoding='utf-8')) - init_course(data) - elif len(argv) > 1: - course_url = argv[1] - handler(course_url) - else: - if effective_course_url_env: - print(f"Using course url from env: { 'COURSE_URL' if course_url_env_primary else 'COURSE_LINK' }") - handler(effective_course_url_env) + try: + if ('--json' in argv and len(argv) > 2) or os.getenv('COURSE_DATA_FILE'): + if '--json' in argv: + json_path = Path(argv[argv.index('--json') + 1]) + if not json_path.exists(): + print(f"File not found: {json_path}") + return + print('Using Custom Metadata File for course data.') + data = json.loads(json_path.read_text(encoding='utf-8')) + else: + course_data_file = os.getenv('COURSE_DATA_FILE') + if not course_data_file: + print('COURSE_DATA_FILE env var not set.') + return + json_path = Path(course_data_file) + if not json_path.exists(): + print(f"File not found: {json_path}") + return + print('Loading Custom Metadata File from env for course data.') + data = json.loads(json_path.read_text(encoding='utf-8')) + init_course(data) + elif len(argv) > 1: + course_url = argv[1] + handler(course_url) else: - print('No course URL resolved.') - print('Usage for using course url: python thinkidownloader3.py ') - print('Or set 
COURSE_URL=... (fallback: COURSE_LINK=...) in .env') - print('Usage for selective download: python thinkidownloader3.py --json ') + if effective_course_url_env: + print(f"Using course url from env: { 'COURSE_URL' if course_url_env_primary else 'COURSE_LINK' }") + handler(effective_course_url_env) + else: + print('No course URL resolved.') + print('Usage for using course url: python thinkidownloader3.py ') + print('Or set COURSE_URL=... (fallback: COURSE_LINK=...) in .env') + print('Usage for selective download: python thinkidownloader3.py --json ') + finally: + # Clean up download manager + global DOWNLOAD_MANAGER + if DOWNLOAD_MANAGER: + DOWNLOAD_MANAGER.close() if __name__ == '__main__': diff --git a/thinkific_downloader/progress_manager.py b/thinkific_downloader/progress_manager.py new file mode 100644 index 0000000..973edcf --- /dev/null +++ b/thinkific_downloader/progress_manager.py @@ -0,0 +1,228 @@ +import sys +from typing import List, Dict, Any, Optional +from pathlib import Path +from rich.console import Console +from rich.progress import Progress, TaskID, TextColumn, BarColumn, TimeRemainingColumn, TransferSpeedColumn, DownloadColumn +from rich.panel import Panel +from rich.text import Text +from rich.layout import Layout +from rich.live import Live + +console = Console() + +class ProgressDisplay: + """Manages rich progress display for downloads.""" + + def __init__(self): + self.progress = Progress( + TextColumn("[bold blue]{task.fields[filename]}", justify="right"), + BarColumn(bar_width=40), + "[progress.percentage]{task.percentage:>3.1f}%", + "โ€ข", + DownloadColumn(), + "โ€ข", + TransferSpeedColumn(), + "โ€ข", + TimeRemainingColumn(), + console=console, + transient=True + ) + self.tasks: Dict[str, TaskID] = {} + + def add_task(self, filename: str, total_size: Optional[int] = None) -> TaskID: + """Add a download task to the progress display.""" + task_id = self.progress.add_task( + "download", + filename=filename, + total=total_size + ) + 
self.tasks[filename] = task_id + return task_id + + def update_task(self, filename: str, advance: int = 0, **kwargs): + """Update progress for a specific task.""" + if filename in self.tasks: + self.progress.update(self.tasks[filename], advance=advance, **kwargs) + + def complete_task(self, filename: str): + """Mark a task as completed.""" + if filename in self.tasks: + self.progress.update(self.tasks[filename], completed=True) + + def start(self): + """Start the progress display.""" + self.progress.start() + + def stop(self): + """Stop the progress display.""" + self.progress.stop() + + +class ContentProcessor: + """Handles content processing with cleaner output.""" + + def __init__(self): + self.processed_items = [] + self.download_queue = [] + + def process_content_item(self, item: Dict[str, Any], index: int) -> Dict[str, Any]: + """Process a content item and collect download tasks.""" + content_type = item.get('contentable_type') or item.get('default_lesson_type_label', 'Unknown') + name = item.get('name', 'Untitled') + + # Create a clean summary + summary = { + 'index': index, + 'name': name, + 'type': content_type, + 'files': [], + 'status': 'pending' + } + + # Log the processing + console.print(f"[cyan]๐Ÿ“‹ Processing:[/cyan] {content_type} - {name}", style="dim") + + # Collect files to download based on content type + files_to_download = self._get_files_for_content_type(item, content_type) + summary['files'] = files_to_download + + # Add to download queue + self.download_queue.extend(files_to_download) + + self.processed_items.append(summary) + return summary + + def _get_files_for_content_type(self, item: Dict[str, Any], content_type: str) -> List[Dict[str, Any]]: + """Get list of files to download for a specific content type.""" + files = [] + + if content_type == 'Lesson': + # Video files + files.append({ + 'type': 'video', + 'url': f"wistia:{item.get('contentable')}", # Placeholder + 'filename': f"{item.get('name', 'video')}.mp4", + 'size_estimate': 
'100-500MB' + }) + + elif content_type == 'Pdf': + files.append({ + 'type': 'pdf', + 'url': f"pdf:{item.get('contentable')}", # Placeholder + 'filename': f"{item.get('name', 'document')}.pdf", + 'size_estimate': '1-10MB' + }) + + elif content_type == 'HtmlItem': + files.append({ + 'type': 'html', + 'url': f"html:{item.get('contentable')}", # Placeholder + 'filename': f"{item.get('name', 'content')}.html", + 'size_estimate': '<1MB' + }) + + elif content_type == 'Audio': + files.append({ + 'type': 'audio', + 'url': f"audio:{item.get('contentable')}", # Placeholder + 'filename': f"{item.get('name', 'audio')}.mp3", + 'size_estimate': '5-50MB' + }) + + return files + + def print_summary(self): + """Print a summary of all processed content.""" + if not self.processed_items: + console.print("[yellow]No content items processed[/yellow]") + return + + # Create summary panel + summary_text = Text() + summary_text.append("๐Ÿ“Š Content Summary\n", style="bold cyan") + + type_counts = {} + total_files = 0 + + for item in self.processed_items: + content_type = item['type'] + type_counts[content_type] = type_counts.get(content_type, 0) + 1 + total_files += len(item['files']) + + summary_text.append(f"Total Items: {len(self.processed_items)}\n", style="white") + summary_text.append(f"Total Files: {total_files}\n", style="white") + summary_text.append("\nContent Types:\n", style="bold white") + + for content_type, count in type_counts.items(): + emoji = self._get_type_emoji(content_type) + summary_text.append(f" {emoji} {content_type}: {count}\n", style="green") + + panel = Panel(summary_text, title="Processing Summary", border_style="blue") + console.print(panel) + + def _get_type_emoji(self, content_type: str) -> str: + """Get emoji for content type.""" + emoji_map = { + 'Lesson': '๐ŸŽฅ', + 'Pdf': '๐Ÿ“„', + 'HtmlItem': '๐Ÿ“', + 'Audio': '๐ŸŽต', + 'Quiz': '๐Ÿ“', + 'Download': '๐Ÿ“', + 'Presentation': '๐ŸŽจ', + 'Multimedia': '๐Ÿ–ผ๏ธ' + } + return emoji_map.get(content_type, 
'๐Ÿ“‹') + + +def print_banner(): + """Print a clean banner.""" + banner_text = Text() + banner_text.append("๐Ÿš€ THINKIFIC DOWNLOADER\n", style="bold cyan") + banner_text.append("Enhanced with Parallel Downloads & Rich UI\n", style="green") + + panel = Panel( + banner_text, + title="Starting Download", + border_style="cyan", + padding=(1, 2) + ) + console.print(panel) + + +def print_download_start_banner(total_files: int, parallel_workers: int): + """Print banner before starting downloads.""" + info_text = Text() + info_text.append(f"๐Ÿ“ Files to download: {total_files}\n", style="white") + info_text.append(f"๐Ÿ”„ Parallel workers: {parallel_workers}\n", style="white") + info_text.append(f"โšก Enhanced features: Rate limiting, Resume, Validation\n", style="green") + + panel = Panel( + info_text, + title="Download Configuration", + border_style="green", + padding=(1, 2) + ) + console.print(panel) + + +def print_completion_summary(successful: int, failed: int, total_time: float): + """Print completion summary.""" + status_text = Text() + + if failed == 0: + status_text.append("๐ŸŽ‰ All downloads completed successfully!\n", style="bold green") + else: + status_text.append(f"โš ๏ธ Downloads completed with {failed} failures\n", style="bold yellow") + + status_text.append(f"โœ… Successful: {successful}\n", style="green") + status_text.append(f"โŒ Failed: {failed}\n", style="red" if failed > 0 else "dim") + status_text.append(f"โฑ๏ธ Total time: {total_time:.1f}s\n", style="blue") + + panel = Panel( + status_text, + title="Download Complete", + border_style="green" if failed == 0 else "yellow", + padding=(1, 2) + ) + console.print(panel) \ No newline at end of file diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index a087956..5ec6423 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -6,6 +6,7 @@ from pathlib import Path import os from .file_utils import 
filter_filename +from .download_manager import DownloadManager # Local imports inside functions to avoid circular dependency during module import # Handles video proxy and wistia direct downloads @@ -32,7 +33,11 @@ def video_downloader_wistia(wistia_id: str, file_name: Optional[str] = None, qua automatically decompressed by urllib. Falls back to selecting first asset if desired quality not present. """ - from .downloader import download_file_chunked # delayed import (avoid circular) + from .downloader import DOWNLOAD_MANAGER # delayed import + + if not DOWNLOAD_MANAGER: + from .downloader import init_settings + init_settings() json_url = WISTIA_JSON_URL.format(id=wistia_id) @@ -132,7 +137,10 @@ def infer_ext(asset: dict) -> str: if all_formats_flag: print(f"Downloading all available Wistia assets for {resolved_base}") - from .downloader import download_file_chunked # local import inside to avoid circular earlier + from .downloader import DOWNLOAD_MANAGER # local import inside to avoid circular earlier + if not DOWNLOAD_MANAGER: + from .downloader import init_settings + init_settings() seen: List[str] = [] for asset in assets: a_url = asset.get('url') @@ -151,7 +159,10 @@ def infer_ext(asset: dict) -> str: if not out_name.endswith(ext): out_name += ext print(f"Asset: {display} -> {a_url}") - download_file_chunked(a_url, filter_filename(out_name)) + if DOWNLOAD_MANAGER: + DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name))) + else: + print("Download manager not initialized") return # Single quality path @@ -177,4 +188,9 @@ def infer_ext(asset: dict) -> str: ext = '.mp4' # Default fallback resolved_name = resolved_base + (ext if not resolved_base.endswith(ext) else '') print(f"URL : {video_url}\nFile Name : {resolved_name}") - download_file_chunked(video_url, resolved_name) + + # Queue video for parallel download with absolute path to current directory + from .downloader import add_download_task + current_dir = Path.cwd() # Capture current working 
directory + full_path = current_dir / resolved_name # Create absolute path + add_download_task(video_url, full_path, "video")