diff --git a/.env.example b/.env.example index 17c0873..06d3c77 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,13 @@ # Thinkific-Downloader Environment Configuration # Copy this file to .env and fill in your actual values +# =============================================== +# REQUIRED AUTHENTICATION +# =============================================== + # For downloading all content, use the course link. COURSE_LINK="https://your-thinkific-site.com/api/course_player/v2/courses/your-course-name" -# For selective content downloads, use the JSON file created from Thinki Parser. -# Copy the file to the Thinkifi Downloader root folder. -# Specify the file name below. Ex. COURSE_DATA_FILE="modified-course.json" -COURSE_DATA_FILE="" - # Client date header - Get this from browser Developer Tools Network tab CLIENT_DATE="2025-09-23T07:42:31.512Z" @@ -16,31 +15,69 @@ CLIENT_DATE="2025-09-23T07:42:31.512Z" # IMPORTANT: Keep this secret and never share it! COOKIE_DATA="_thinkific_session=YOUR_SESSION_COOKIE_HERE" +# =============================================== +# BASIC SETTINGS +# =============================================== + # Quality Available: "Original File", "1080p", "720p", "540p", "360p", "224p" # Recommended: "720p" for good quality and reasonable file size VIDEO_DOWNLOAD_QUALITY="720p" -# Set to true to download all available video formats/qualities -# Warning: This will significantly increase download size and time -# ALL_VIDEO_FORMATS=false +# Set download directory (defaults to ./downloads) +# All course content will be downloaded to this directory +OUTPUT_DIR="./downloads" + +# =============================================== +# ENHANCED FEATURES +# =============================================== + +# Number of concurrent downloads (default: 3, recommended: 1-5) +# Higher numbers may trigger rate limiting +CONCURRENT_DOWNLOADS=3 + +# Delay between downloads in seconds (default: 1.0) +# Increase if you encounter rate limiting issues +DOWNLOAD_DELAY=1.0 
+ +# Number of retry attempts for failed downloads (default: 3) +RETRY_ATTEMPTS=3 + +# Rate limiting in MB/s (default: unlimited) +# Set a value to limit download speed (e.g., RATE_LIMIT_MB_S=5.0) +# RATE_LIMIT_MB_S= + +# File validation after download (default: true) +# Validates file integrity and size +VALIDATE_DOWNLOADS=true + +# Resume partial downloads (default: true) +# Automatically resume interrupted downloads +RESUME_PARTIAL=true + +# Debug mode (default: false) +# Enable detailed logging for troubleshooting +DEBUG=false + +# =============================================== +# ADVANCED SETTINGS +# =============================================== # Set to true to enable ffmpeg presentation merging (requires ffmpeg installed) # This combines multi-part presentations into single video files -# FFMPEG_PRESENTATION_MERGE=false - -# Optional: Set download directory (defaults to ./downloads) -# OUTPUT_DIR="./downloads" +FFMPEG_PRESENTATION_MERGE=false -# Optional: Number of concurrent downloads (default: 2) -# Higher numbers may trigger rate limiting -# CONCURRENT_DOWNLOADS=2 +# =============================================== +# OPTIONAL FEATURES (LEGACY SUPPORT) +# =============================================== -# Optional: Delay between downloads in seconds (default: 1) -# Increase if you encounter rate limiting issues -# DOWNLOAD_DELAY=1 +# For selective content downloads, use the JSON file created from Thinki Parser. +# Copy the file to the Thinkifi Downloader root folder. +# Specify the file name below. Ex. 
COURSE_DATA_FILE="modified-course.json" +# COURSE_DATA_FILE="" -# Optional: Number of retry attempts for failed downloads (default: 3) -# RETRY_ATTEMPTS=3 +# Set to true to download all available video formats/qualities +# Warning: This will significantly increase download size and time +# ALL_VIDEO_FORMATS=false -# Optional: Log level (DEBUG, INFO, WARNING, ERROR) +# Log level (DEBUG, INFO, WARNING, ERROR) # LOG_LEVEL="INFO" \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bf14598 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,43 @@ +name: ๐งช CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + name: ๐งช Basic Tests + runs-on: ubuntu-latest + + steps: + - name: ๐๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐ Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: ๐ฆ Install dependencies + run: | + pip install -r requirements.txt + + - name: ๐งช Test imports + run: | + python -c "import thinkific_downloader; print('โ Package imports work')" + python -c "from thinkific_downloader.config import Settings; print('โ Config works')" + + docker: + name: ๐ณ Docker Build + runs-on: ubuntu-latest + + steps: + - name: ๐๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐ณ Build Docker image + run: | + docker build -t thinkific-downloader:test . 
+ echo "โ Docker image builds successfully" \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..3ebe7c0 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,117 @@ +name: ๐ Release + +on: + push: + tags: + - 'v*.*.*' + +jobs: + release: + name: ๐ฆ Create Release + runs-on: ubuntu-latest + + steps: + - name: ๐๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐ท๏ธ Get version + id: version + run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + + - name: ๐ Create Release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.version.outputs.tag }} + release_name: ๐ Release ${{ steps.version.outputs.tag }} + body: | + ## ๐ New Release: ${{ steps.version.outputs.tag }} + + ### ๐ Installation Options + + **Docker Hub:** + ```bash + docker pull kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} + # or + docker pull kvnxo/thinkific-downloader:latest + ``` + + **GitHub Packages:** + ```bash + docker pull ghcr.io/itskavin/thinkific-downloader:${{ steps.version.outputs.tag }} + # or + docker pull ghcr.io/itskavin/thinkific-downloader:latest + ``` + + **Setup and Run:** + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + cp .env.example .env + # Edit .env with your details + docker-compose up + ``` + + **Python Direct:** + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + pip install -r requirements.txt + python thinkificdownloader.py + ``` + + ### ๐ฏ Key Features + - Downloads to `./downloads/` by default + - Docker support for easy setup + - Parallel downloads + - Smart resume functionality + draft: false + prerelease: false + + docker: + name: ๐ณ Build Docker + runs-on: ubuntu-latest + needs: release + if: success() + + steps: + - name: ๐๏ธ Checkout + uses: actions/checkout@v4 + + - name: ๐ท๏ธ Get 
version + id: version + run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + + - name: ๐ Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: ๐ Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: ๐ง Build and Push to Docker Hub + run: | + docker build -t kvnxo/thinkific-downloader:latest . + docker build -t kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} . + docker push kvnxo/thinkific-downloader:latest + docker push kvnxo/thinkific-downloader:${{ steps.version.outputs.tag }} + + - name: ๐ฆ Build and Push to GitHub Packages + run: | + # Convert repository name to lowercase for GitHub Container Registry + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + + # Build and tag for GitHub Container Registry + docker build -t ghcr.io/${REPO_LOWER}:latest . + docker build -t ghcr.io/${REPO_LOWER}:${{ steps.version.outputs.tag }} . + + # Push to GitHub Container Registry + docker push ghcr.io/${REPO_LOWER}:latest + docker push ghcr.io/${REPO_LOWER}:${{ steps.version.outputs.tag }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 385d88f..e3dadea 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,6 @@ __pycache__/ build/ develop-eggs/ dist/ -downloads/ eggs/ .eggs/ lib/ @@ -93,4 +92,5 @@ ffmpeg.log # Docker runtime artifacts (keep config files in git) .docker/ docker-volumes/ -*.pid \ No newline at end of file +*.pid +thinkific-launch-accelerator-course-october-2025/.download_status.json.bak diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 78e7212..a0cdb5b 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -62,13 +62,15 @@ graph TB - Retry logic with exponential backoff - Resource management -#### **4. 
File Validator** (`file_validator.py`) -- **Purpose**: Smart file validation and skip logic + +#### **4. Resume Tracker** (`resume_tracker.py`) +- **Purpose**: Atomic, cross-platform resume and status tracking - **Features**: - - File integrity checking (size, checksums) - - Resume detection and validation - - Download metadata persistence - - Smart skip decisions + - Download status tracking and backup (Windows, Mac, Linux) + - File integrity checking (size, checksums) + - Resume detection and validation + - Download metadata persistence + - Smart skip decisions #### **5. Content Processors** - **Wistia Downloader** (`wistia_downloader.py`): Video processing @@ -179,7 +181,7 @@ graph TD 3. **Course Processing**: API calls โ Content parsing โ Task creation 4. **Download Orchestration**: Task queue โ `download_manager.py` โ Parallel workers 5. **Progress Tracking**: Thread-safe updates โ `progress_manager.py` โ Rich UI -6. **Validation**: File checks โ `file_validator.py` โ Skip decisions +6. 
**Validation**: File checks โ `resume_tracker.py` โ Skip decisions --- @@ -286,7 +288,7 @@ tests/ โโโ unit/ โ โโโ test_progress_manager.py โ โโโ test_download_manager.py -โ โโโ test_file_validator.py +โ โโโ test_resume_tracker.py โ โโโ test_enhanced_downloader.py โโโ integration/ โ โโโ test_full_download.py @@ -339,6 +341,18 @@ class TestProgressManager: # Assert assert file_id in progress_manager.downloads assert download.filename == filename + + def test_resume_tracker_atomic_save(self): + from thinkific_downloader.resume_tracker import ResumeTracker + import tempfile + with tempfile.TemporaryDirectory() as tmpdir: + status_file = Path(tmpdir) / ".download_status.json" + tracker = ResumeTracker(str(status_file)) + tracker.status_data["test"] = {"status": "completed"} + tracker._save_status() + assert status_file.exists() + backup_file = status_file.with_suffix('.json.bak') + assert backup_file.exists() @patch('thinkific_downloader.progress_manager.time.time') def test_calculate_download_speed(self, mock_time, progress_manager): diff --git a/README.md b/README.md index 80785bf..7bf7bf3 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,12 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **๐ง Smart File Validation** - Automatic integrity checking and corruption detection - **โถ๏ธ Resume Downloads** - Intelligent partial download recovery and continuation - **โญ๏ธ Skip Existing Files** - Automatic detection and skipping of completed downloads +- **๐พ Atomic Resume/Backup System** - Cross-platform safe status tracking and backup (Windows, Mac, Linux) ### ๐ฏ **Progress Monitoring** -``` -๐พ introduction.mp4 โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 100% 156.2MB โข 12.3MB/s โข Complete -๐ lesson-02.mp4 โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 45% 89.1MB/198.4MB โข 8.7MB/s โข 0:00:12 -โณ lesson-03.pdf โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0% Queued -``` +#### Example Progress UI + + ### ๐ **Reliability & Safety** - **๐ Exponential Retry Logic** - Smart retry with 
jitter for failed downloads @@ -69,6 +68,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Rich Terminal Interface** - Beautiful progress bars and status updates - **Smart File Organization** - Logical folder structure with clean naming - **Resume Support** - Skip existing files, continue interrupted downloads +- **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.) - **Comprehensive Logging** - Debug mode for troubleshooting @@ -77,28 +77,84 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Session Management** - Proper authentication handling - **Error Recovery** - Graceful handling of network issues - **Validation** - File integrity checks and cleanup +- **Atomic Status File** - Download status is always saved safely, with backup, for reliable resume ## ๐ฏ **Quick Start** +**โ ๏ธ Important**: Always clone or download the project first! The application needs access to the project directory for downloads, configuration files (.env), and proper functionality. 
+ ### **๐ณ Docker (Recommended)** +**Step 1: Get the Project** +```bash +# Clone or download the project +git clone https://github.com/itskavin/Thinkific-Downloader.git +cd Thinkific-Downloader + +# Or download and extract ZIP, then navigate to project directory +``` + +**Step 2: Setup Environment** ```bash +# Create your .env file (see configuration section below) +cp .env.example .env +# Edit .env with your course details +``` + +**Step 3: Run with Docker** +```bash +# Option 1: Docker Hub docker pull kvnxo/thinkific-downloader -docker run -it --rm -v $(pwd)/downloads:/app/downloads kvnxo/thinkific-downloader +docker run -it --rm -v $(pwd)/downloads:/app/downloads --env-file .env kvnxo/thinkific-downloader + +# Option 2: GitHub Packages +docker pull ghcr.io/itskavin/thinkific-downloader +docker run -it --rm -v $(pwd)/downloads:/app/downloads --env-file .env ghcr.io/itskavin/thinkific-downloader + +# Option 3: Docker Compose (recommended) +docker-compose up ``` ### **๐ Python Direct** ```bash +# Step 1: Clone the project git clone https://github.com/itskavin/Thinkific-Downloader.git cd Thinkific-Downloader + +# Step 2: Install dependencies pip install -r requirements.txt -# Update environment variables in .env or export them directly -python thinkidownloader3.py +# Step 3: Configure and run +# Update environment variables in .env file +python thinkificdownloader.py +``` + +### **๐ฆ Source Code Packages** + +Get the latest source code: + +```bash +# Clone the repository +git clone https://github.com/itskavin/Thinkific-Downloader.git +cd Thinkific-Downloader + +# Setup and run with Docker +cp .env.example .env +# Edit .env with your course details +docker-compose up +# Or run with Python +pip install -r requirements.txt +python thinkificdownloader.py ``` +> **Resume/Backup System:** +> - Download status is tracked in `.download_status.json` (atomic, cross-platform) +> - A backup `.download_status.json.bak` is created automatically before each update +> - If 
interrupted, simply rerun the downloader to resume from where you left off +> - Works seamlessly on Windows, Mac, and Linux + > ๐ **Need detailed setup instructions?** Check out our comprehensive [**SETUP.md**](SETUP.md) guide for step-by-step installation, troubleshooting, and configuration options. > ๐จโ๐ป **Developer?** Visit [**DEVELOPMENT.md**](DEVELOPMENT.md) for architecture overview, API reference, and contribution guidelines. @@ -108,27 +164,36 @@ python thinkidownloader3.py Configure advanced features via environment variables or `.env` file: ```bash -# Required +# =============================================== +# REQUIRED AUTHENTICATION +# =============================================== COURSE_LINK="" # Thinkific course URL COOKIE_DATA="" # Browser cookies for authentication CLIENT_DATE="" # Client date header -# Optional - Performance -VIDEO_DOWNLOAD_QUALITY="Original File" # Video quality (Original File,720p, 1080p, etc.) -CONCURRENT_DOWNLOADS=3 # Number of parallel downloads (1-10 recommended) +# =============================================== +# BASIC SETTINGS +# =============================================== +VIDEO_DOWNLOAD_QUALITY="720p" # Video quality (Original File, 720p, 1080p, etc.) 
+OUTPUT_DIR="./downloads" # Download directory (defaults to ./downloads) + +# =============================================== +# ENHANCED FEATURES +# =============================================== +CONCURRENT_DOWNLOADS=3 # Number of parallel downloads (1-5 recommended) RETRY_ATTEMPTS=3 # Number of retry attempts for failed downloads -RATE_LIMIT_MB_S=0 # Rate limit in MB/s (0 = unlimited) DOWNLOAD_DELAY=1.0 # Delay between downloads (seconds) +RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited) -# Optional - Features +# Feature toggles VALIDATE_DOWNLOADS=true # Enable file integrity validation RESUME_PARTIAL=true # Enable resume for partial downloads DEBUG=false # Enable debug logging -# Optional - System -OUTPUT_DIR=./downloads # Download directory +# =============================================== +# ADVANCED SETTINGS +# =============================================== FFMPEG_PRESENTATION_MERGE=false # Enable FFmpeg presentation merging -LOG_LEVEL=INFO # Logging level (DEBUG, INFO, WARNING) ``` ``` @@ -151,24 +216,29 @@ docker-compose up ## ๐ **Output Structure** +**Default Location**: All courses are downloaded to `./downloads/` directory in your project folder. + ``` -๐ Course Name/ -โโโ ๐ 01. Introduction/ -โ โโโ ๐ 01. Welcome Video/ -โ โ โโโ ๐ฅ welcome-video.mp4 -โ โ โโโ ๐ video-info.json -โ โโโ ๐ 02. Course Overview/ -โ โโโ ๐ course-overview.html -โ โโโ ๐ quiz-structure.json -โโโ ๐ 02. Getting Started/ -โ โโโ ๐ 01. Setup Instructions/ -โ โโโ ๐ฅ setup-instructions.mp4 -โ โโโ ๐ setup-guide.pdf -โ โโโ ๐จ presentation-slides.mp4 -โโโ ๐ course-metadata.json -โโโ ๐ download-summary.json +๐ downloads/ +โโโ ๐ Course Name/ + โโโ ๐ 01. Introduction/ + โ โโโ ๐ 01. Welcome Video/ + โ โ โโโ ๐ฅ welcome-video.mp4 + โ โ โโโ ๐ video-info.json + โ โโโ ๐ 02. Course Overview/ + โ โโโ ๐ course-overview.html + โ โโโ ๐ quiz-structure.json + โโโ ๐ 02. Getting Started/ + โ โโโ ๐ 01. 
Setup Instructions/ + โ โโโ ๐ฅ setup-instructions.mp4 + โ โโโ ๐ setup-guide.pdf + โ โโโ ๐จ presentation-slides.mp4 + โโโ ๐ course-metadata.json + โโโ ๐ download-summary.json ``` +**Customization**: Set `OUTPUT_DIR=./my-custom-path` in your `.env` file to change the download location. + ### **Supported Content Types** @@ -182,7 +252,22 @@ docker-compose up | **Quizzes** | `.json` | Structure export | Question/answer format | ## โ **FAQ** +### **Resume/Backup System** + +**Q: How does resume work?** +- The downloader automatically tracks download status in `.download_status.json`. +- Before updating, a backup `.download_status.json.bak` is created (atomic, safe). +- If interrupted, just rerun the downloader. It will resume partial downloads, skip completed files, and retry failed ones. +- No manual intervention needed. + +**Q: Is it safe on Windows, Mac, Linux?** +- Yes! The resume/backup system uses atomic file operations and works on all major platforms. + +**Q: Where is the status file stored?** +- In the current working directory (where you run the downloader). +**Q: Can I delete the status file?** +- Yes, but you will lose resume progress. The backup file is for safety only. ### **๐ Authentication & Setup** **Q: How do I get the required authentication data?** diff --git a/SETUP.md b/SETUP.md index 244afd3..a1318fc 100644 --- a/SETUP.md +++ b/SETUP.md @@ -36,88 +36,38 @@ This comprehensive guide walks you through installing and configuring Thinkific- ## ๐ Installation Methods -### **Method 1: Docker (Recommended - Easiest)** - -Docker provides the most consistent and hassle-free experience with all dependencies pre-installed. 
- -#### **1.1 Install Docker** -- **Windows/Mac**: Download [Docker Desktop](https://www.docker.com/products/docker-desktop/) -- **Linux**: Follow [Docker installation guide](https://docs.docker.com/engine/install/) - -#### **1.2 Pull and Run** -```bash -# Pull the latest image -docker pull kvnxo/thinkific-downloader:latest - -# Run with basic setup -docker run -it --rm \ - -v $(pwd)/downloads:/app/downloads \ - -e COURSE_LINK="YOUR_COURSE_URL" \ - -e COOKIE_DATA="YOUR_COOKIES" \ - -e CLIENT_DATE="YOUR_CLIENT_DATE" \ - kvnxo/thinkific-downloader:latest -``` - -#### **1.3 Docker Compose (Recommended)** -Create `docker-compose.yml`: -```yaml -version: '3.8' -services: - thinkific-downloader: - image: kvnxo/thinkific-downloader:latest - volumes: - - ./downloads:/app/downloads - - ./.env:/app/.env - environment: - - COURSE_LINK=${COURSE_LINK} - - COOKIE_DATA=${COOKIE_DATA} - - CLIENT_DATE=${CLIENT_DATE} - # Enhanced features - - CONCURRENT_DOWNLOADS=3 - - RETRY_ATTEMPTS=3 -``` - -Run with: `docker-compose up` +### **๐ฆ Option 1: Clone Repository (Recommended)** + +Get the latest version directly from GitHub: + +1. **Clone the repository**: + ```bash + git clone https://github.com/itskavin/Thinkific-Downloader.git + cd Thinkific-Downloader + ``` + +2. **Setup configuration**: + ```bash + cp .env.example .env + # Edit .env with your course details (see Authentication Setup below) + ``` + +3. **Run with Docker** (Recommended): + ```bash + docker-compose up + ``` + + **Or run with Python**: + ```bash + pip install -r requirements.txt + python thinkificdownloader.py + ``` --- -### **Method 2: Python Package Installation** - -For users who prefer native Python installation with full control. 
- -#### **2.1 Clone Repository** -```bash -# Clone the repository -git clone https://github.com/itskavin/Thinkific-Downloader.git -cd Thinkific-Downloader -``` - -#### **2.2 Quick Setup (Automated)** -```bash -# Run the automated installer -python install.py -``` - -#### **2.3 Manual Installation** -```bash -# Create virtual environment (recommended) -python -m venv venv -source venv/bin/activate # Linux/Mac -# or -venv\Scripts\activate # Windows - -# Install dependencies -pip install -r requirements.txt +### **๐ณ Option 2: Docker Only** -# Install in development mode -pip install -e . -``` - -#### **2.4 Verify Installation** -```bash -# Test the installation -python -m thinkific_downloader --help -``` +If you want to use Docker without cloning: --- @@ -204,6 +154,10 @@ VALIDATE_DOWNLOADS=true # Resume Partial Downloads (true/false) RESUME_PARTIAL=true +# Atomic Resume/Backup System (always enabled) +# Download status is tracked in .download_status.json (atomic, cross-platform) +# A backup .download_status.json.bak is created automatically before each update + # Debug Mode (true/false) DEBUG=false @@ -280,6 +234,15 @@ docker run -it --rm \ ๐ Course: Your Course Name | Progress: 0.0% (0/25 files) | Speed: 0.0 MB/s | ETA: Unknown +๐ Resume Status Summary + โ 5 files already completed + ๐ฅ 2 files partially downloaded (will resume) + โ 1 files previously failed (will retry) + +๐ Files to download: 31 +๐ Parallel workers: 3 +โก Enhanced features: Rate limiting, Resume, Validation + ๐ฅ introduction.mp4 โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 100% 156.2MB โข 12.3MB/s โข Complete ๐ lesson-02.mp4 โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 45% 89.1MB/198.4MB โข 8.7MB/s โข 0:00:12 โณ lesson-03.pdf โโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0% Queued @@ -535,6 +498,23 @@ After setup, verify everything works: ```bash # Should show course info without downloading python -c "from thinkific_downloader.config import Settings; s=Settings.from_env(); print(f'โ Auth OK for {s.client_date[:20]}...')" + +## Resume/Backup 
System + +**How does resume work?** +- The downloader automatically tracks download status in `.download_status.json`. +- Before updating, a backup `.download_status.json.bak` is created (atomic, safe). +- If interrupted, just rerun the downloader. It will resume partial downloads, skip completed files, and retry failed ones. +- No manual intervention needed. + +**Is it safe on Windows, Mac, Linux?** +- Yes! The resume/backup system uses atomic file operations and works on all major platforms. + +**Where is the status file stored?** +- In the current working directory (where you run the downloader). + +**Can I delete the status file?** +- Yes, but you will lose resume progress. The backup file is for safety only. ``` ### **2. Test Network Connection** diff --git a/images/image.png b/images/image.png new file mode 100644 index 0000000..37cb897 Binary files /dev/null and b/images/image.png differ diff --git a/requirements.txt b/requirements.txt index cc647e0..cef341a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ tqdm>=4.65.0 +requests>=2.28.0 rich>=13.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 15ca50e..6416b43 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,16 @@ packages=find_packages(), python_requires=">=3.8", install_requires=[ + "requests>=2.31.0", + "rich>=13.0.0", "tqdm>=4.65.0", + "urllib3>=2.0.0", ], extras_require={ + "enhanced": [ + "beautifulsoup4>=4.12.0", + "lxml>=4.9.0", + ], "brotli": ["brotli>=1.0.9"], }, entry_points={ diff --git a/thinkific_downloader/__init__.py b/thinkific_downloader/__init__.py index 7254b04..ee6f1b4 100644 --- a/thinkific_downloader/__init__.py +++ b/thinkific_downloader/__init__.py @@ -11,7 +11,7 @@ Features: - Modern Python package architecture -- Rich terminal UI with progress tracking +- progress tracking - Smart retry logic and error recovery - Resume support for interrupted downloads - Docker containerization with FFmpeg diff --git a/thinkific_downloader/config.py 
b/thinkific_downloader/config.py index adf3bb0..5b98829 100644 --- a/thinkific_downloader/config.py +++ b/thinkific_downloader/config.py @@ -27,6 +27,7 @@ class Settings: cookie_data: str video_download_quality: str = '720p' ffmpeg_presentation_merge: bool = False + output_dir: str = './downloads' # Default to downloads directory # Enhanced downloader settings concurrent_downloads: int = 3 retry_attempts: int = 3 @@ -40,23 +41,37 @@ class Settings: @classmethod def from_env(cls): load_env() + + # Required authentication client_date = os.getenv('CLIENT_DATE', '') cookie_data = os.getenv('COOKIE_DATA', '') + + # Basic settings with matching defaults to .env.example video_download_quality = os.getenv('VIDEO_DOWNLOAD_QUALITY', '720p') + output_dir = os.getenv('OUTPUT_DIR', './downloads') + + # Advanced settings ffmpeg_flag_raw = os.getenv('FFMPEG_PRESENTATION_MERGE', 'false').lower() ffmpeg_merge = ffmpeg_flag_raw in ('1', 'true', 'yes', 'on') - # Enhanced settings + # Enhanced downloader settings with matching defaults concurrent_downloads = int(os.getenv('CONCURRENT_DOWNLOADS', '3')) retry_attempts = int(os.getenv('RETRY_ATTEMPTS', '3')) - rate_limit_mb_s = float(os.getenv('RATE_LIMIT_MB_S', '0')) or None download_delay = float(os.getenv('DOWNLOAD_DELAY', '1.0')) + + # Rate limiting - empty string or 0 means unlimited + rate_limit_env = os.getenv('RATE_LIMIT_MB_S', '') + rate_limit_mb_s = float(rate_limit_env) if rate_limit_env and rate_limit_env != '0' else None + + # Feature toggles validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on') resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on') debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on') + # Validation if not client_date or not cookie_data: raise SystemExit('Cookie data and Client Date not set. 
Use the ReadMe file first before using this script.') + # Basic directory permissions check cwd = Path.cwd() if not os.access(cwd, os.W_OK): @@ -64,7 +79,8 @@ def from_env(cls): return cls( client_date=client_date, cookie_data=cookie_data, - video_download_quality=video_download_quality, + video_download_quality=video_download_quality, + output_dir=output_dir, ffmpeg_presentation_merge=ffmpeg_merge, concurrent_downloads=concurrent_downloads, retry_attempts=retry_attempts, diff --git a/thinkific_downloader/download_manager.py b/thinkific_downloader/download_manager.py new file mode 100644 index 0000000..3ea0b9c --- /dev/null +++ b/thinkific_downloader/download_manager.py @@ -0,0 +1,542 @@ +import os +import time +import hashlib +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Optional, Dict, Any, Callable, List +from urllib.parse import urlparse +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +from rich.progress import Progress, TaskID, TextColumn, BarColumn, TimeRemainingColumn, TransferSpeedColumn, DownloadColumn +from rich.text import Text +from rich.progress import ProgressColumn + +class QueuedSpeedColumn(ProgressColumn): + """Speed column that shows 'Queued' instead of unrealistic speeds""" + def render(self, task): + # Try to get Rich's calculated speed + try: + # Rich Progress stores speed in task.speed as bytes per second + speed = task.speed + except: + speed = None + + if speed is None or speed <= 0: + return Text("Queued", style="dim") + + # Convert bytes/sec to readable format + if speed >= 1024 * 1024: # >= 1 MB/s + speed_display = speed / (1024 * 1024) + return Text(f"{speed_display:.1f} MB/s", style="green") + elif speed >= 1024: # >= 1 KB/s + speed_display = speed / 1024 + return Text(f"{speed_display:.1f} KB/s", style="green") + else: + return Text(f"{speed:.0f} B/s", style="green") + +class 
class RateLimiter:
    """Token-bucket rate limiter used to cap aggregate download bandwidth."""

    def __init__(self, rate_limit_mb_s: Optional[float] = None):
        # None (or 0) disables limiting entirely.
        self.rate_limit_bytes_s = rate_limit_mb_s * 1024 * 1024 if rate_limit_mb_s else None
        self.tokens = 0.0
        self.last_update = time.time()
        self.lock = threading.Lock()

    def acquire(self, size: int) -> float:
        """Consume *size* bytes worth of tokens.

        Returns 0.0 when the caller may proceed immediately, otherwise the
        number of seconds the caller should sleep before writing the chunk.
        """
        if not self.rate_limit_bytes_s:
            return 0.0

        with self.lock:
            now = time.time()
            elapsed = now - self.last_update
            # Refill, capping the burst at one second's worth of tokens.
            self.tokens = min(self.tokens + elapsed * self.rate_limit_bytes_s,
                              self.rate_limit_bytes_s)
            self.last_update = now

            if self.tokens >= size:
                self.tokens -= size
                return 0.0

            # Not enough tokens: report how long to wait and pre-advance the
            # clock so the next refill starts after the sleep.
            wait_time = (size - self.tokens) / self.rate_limit_bytes_s
            self.tokens = 0.0
            self.last_update = now + wait_time
            return wait_time


class DownloadSession:
    """requests.Session wrapper with pooling, retries and auth headers."""

    def __init__(self, settings: "Settings"):
        self.settings = settings
        self.session = self._create_session()

    def _create_session(self) -> "requests.Session":
        """Build a session with retry/backoff and Thinkific auth headers."""
        session = requests.Session()

        # Retry transient server errors and rate-limit responses.
        retry_strategy = Retry(
            total=self.settings.retry_attempts,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=10,
            pool_maxsize=20,
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json,text/javascript,*/*;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'cross-site',
            'x-requested-with': 'XMLHttpRequest',
            'x-thinkific-client-date': self.settings.client_date,
            'cookie': self.settings.cookie_data,
        })
        return session

    def get(self, url: str, **kwargs) -> "requests.Response":
        """GET *url* with a default 60s timeout.

        Bug fix: the timeout was previously passed positionally alongside
        **kwargs, so a caller supplying its own ``timeout=`` raised
        ``TypeError: got multiple values``. Now it is only a default.
        """
        kwargs.setdefault('timeout', 60)
        return self.session.get(url, **kwargs)

    def close(self):
        """Close the underlying session and its pooled connections."""
        self.session.close()


class FileValidator:
    """Static helpers for integrity and size checks on downloaded files."""

    @staticmethod
    def calculate_checksum(file_path: Path, algorithm: str = 'md5') -> str:
        """Stream the file through *algorithm* and return the hex digest."""
        hash_func = hashlib.new(algorithm)
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b""):
                hash_func.update(chunk)
        return hash_func.hexdigest()

    @staticmethod
    def validate_file_size(file_path: Path, expected_size: Optional[int] = None) -> bool:
        """True when the on-disk size matches *expected_size* (or none given)."""
        if expected_size is None:
            return True
        return file_path.stat().st_size == expected_size

    @staticmethod
    def is_file_complete(file_path: Path, expected_size: Optional[int] = None) -> bool:
        """True when the file exists and (if a size is known) matches it."""
        if not file_path.exists():
            return False
        if expected_size is None:
            return True
        return file_path.stat().st_size == expected_size


class DownloadTask:
    """State for a single download: source URL, destination, and progress."""

    def __init__(self, url: str, dest_path: Path, expected_size: Optional[int] = None,
                 checksum: Optional[str] = None, resume: bool = True):
        self.url = url
        self.dest_path = dest_path
        self.expected_size = expected_size
        self.checksum = checksum
        self.resume = resume
        self.downloaded_size = 0
        self.status = 'pending'       # pending | completed | failed
        self.error: Optional[str] = None

    def is_complete(self) -> bool:
        """True when the destination exists and matches the expected size.

        An unknown (falsy) expected size means any existing file counts as
        complete.
        """
        if not self.dest_path.exists():
            return False
        if self.expected_size:
            return self.dest_path.stat().st_size == self.expected_size
        return True
class DownloadManager:
    """Coordinates parallel file downloads.

    Combines a pooled/retrying HTTP session, a token-bucket rate limiter,
    a thread pool sized by ``settings.concurrent_downloads``, resume via
    HTTP Range requests, and post-download validation.
    """

    def __init__(self, settings: "Settings"):
        self.settings = settings
        self.session = DownloadSession(settings)
        self.rate_limiter = RateLimiter(settings.rate_limit_mb_s)
        self.executor = ThreadPoolExecutor(max_workers=settings.concurrent_downloads)
        self.validator = FileValidator()
        self.active_downloads: Dict[str, "DownloadTask"] = {}
        self.lock = threading.Lock()

    def download_file(self, url: str, dest_path: Path, expected_size: Optional[int] = None,
                      checksum: Optional[str] = None, show_progress: bool = True) -> bool:
        """Download one file; True on success or if a valid copy exists."""
        task = DownloadTask(url, dest_path, expected_size, checksum, self.settings.resume_partial)

        # Skip work when an existing copy passes validation.
        if task.is_complete() and self._validate_download(task):
            if self.settings.debug:
                print(f"File already exists and valid: {dest_path}")
            return True

        return self._download_single_file(task, show_progress)

    def download_files_parallel(self, tasks: List["DownloadTask"],
                                progress_callback: Optional[Callable] = None) -> List[bool]:
        """Download *tasks* concurrently with a Rich progress display.

        Returns one bool per task; already-valid files count as successes
        without re-downloading.
        """
        console = Console()
        progress = Progress(
            TextColumn("[bold blue]{task.fields[filename]}", justify="right"),
            BarColumn(bar_width=40),
            "[progress.percentage]{task.percentage:>3.1f}%",
            "•",
            DownloadColumn(),
            "•",
            TransferSpeedColumn(),
            "•",
            TimeRemainingColumn(),
            console=console,
        )

        with progress:
            futures = []
            results = []

            for task in tasks:
                if task.is_complete() and self._validate_download(task):
                    results.append(True)
                    continue

                # Fetch the size up front so the bar has a meaningful total.
                if task.expected_size is None:
                    task.expected_size = self._get_content_length(task.url)

                progress_task_id = progress.add_task(
                    "download",
                    filename=task.dest_path.name,
                    total=task.expected_size or 100,
                )
                future = self.executor.submit(
                    self._download_with_rich_progress, task, progress, progress_task_id)
                futures.append((future, task, progress_task_id))

            for future, task, progress_task_id in futures:
                try:
                    result = future.result()
                    results.append(result)
                    if progress_callback:
                        progress_callback(task, result)
                except Exception as e:
                    console.print(f"[red]Download failed for {task.dest_path}: {e}[/red]")
                    results.append(False)

        return results

    def _download_with_rich_progress(self, task: "DownloadTask", progress,
                                     progress_task_id: int) -> bool:
        """Worker: stream one file to disk, updating the shared Rich bar."""
        try:
            # Resume from an existing partial file when allowed.
            resume_pos = 0
            if task.resume and task.dest_path.exists():
                resume_pos = task.dest_path.stat().st_size
                if task.expected_size and resume_pos >= task.expected_size:
                    return self._validate_download(task)

            headers = {}
            if task.resume and resume_pos > 0:
                headers['Range'] = f'bytes={resume_pos}-'

            response = self.session.session.get(task.url, headers=headers, stream=True)
            response.raise_for_status()

            # Correct the bar total once the server reports a real length.
            content_length = response.headers.get('Content-Length')
            if content_length:
                total_size = int(content_length) + resume_pos
                if task.expected_size != total_size:
                    task.expected_size = total_size
                    progress.update(progress_task_id, total=total_size)

            mode = 'ab' if resume_pos > 0 else 'wb'

            with open(task.dest_path, mode) as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        # Throttle before writing so aggregate bandwidth
                        # stays under the configured limit.
                        sleep_time = self.rate_limiter.acquire(len(chunk))
                        if sleep_time > 0:
                            time.sleep(sleep_time)

                        f.write(chunk)
                        progress.update(progress_task_id, advance=len(chunk))

            task.status = 'completed'
            return self._validate_download(task)

        except Exception as e:
            task.status = 'failed'
            task.error = str(e)
            # Partial files are only kept when resume is enabled.
            if not task.resume and task.dest_path.exists():
                task.dest_path.unlink()
            return False

    def _download_single_file(self, task: "DownloadTask", show_progress: bool = True) -> bool:
        """Blocking single-file download with Range-based resume."""
        try:
            if task.expected_size is None:
                task.expected_size = self._get_content_length(task.url)

            resume_pos = 0
            if task.resume and task.dest_path.exists():
                resume_pos = task.dest_path.stat().st_size
                if task.expected_size and resume_pos >= task.expected_size:
                    return self._validate_download(task)

            headers = {}
            if task.resume and resume_pos > 0:
                headers['Range'] = f'bytes={resume_pos}-'

            response = self.session.get(task.url, headers=headers, stream=True)
            response.raise_for_status()

            # Defensive: only reachable if the session is configured not to
            # follow redirects (requests follows them by default).
            if response.status_code == 302:
                redirect_url = response.headers.get('Location')
                if redirect_url:
                    response = self.session.get(redirect_url, headers=headers, stream=True)
                    response.raise_for_status()

            content_length = response.headers.get('Content-Length')
            if content_length:
                total_size = int(content_length) + resume_pos
                if task.expected_size is None:
                    task.expected_size = total_size

            mode = 'ab' if resume_pos > 0 else 'wb'

            if show_progress and task.expected_size:
                Console().print(f"[blue]Downloading {task.dest_path.name}...[/blue]")

            with open(task.dest_path, mode) as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        sleep_time = self.rate_limiter.acquire(len(chunk))
                        if sleep_time > 0:
                            time.sleep(sleep_time)
                        f.write(chunk)

            task.status = 'completed'
            return self._validate_download(task)

        except Exception as e:
            task.status = 'failed'
            task.error = str(e)
            print(f"Download failed for {task.dest_path}: {e}")
            if not task.resume and task.dest_path.exists():
                task.dest_path.unlink()
            return False

    def _get_content_length(self, url: str) -> Optional[int]:
        """HEAD the URL for Content-Length; None when unavailable."""
        try:
            response = self.session.session.head(url, timeout=30)
            response.raise_for_status()
            content_length = response.headers.get('Content-Length')
            return int(content_length) if content_length else None
        except Exception:
            # Size is best-effort; any failure just means "unknown".
            return None

    def _validate_download(self, task: "DownloadTask") -> bool:
        """Validate a downloaded file: existence, size, and basic structure."""
        if not task.dest_path.exists():
            print(f"[ERROR] File missing: {task.dest_path.name}")
            return False

        try:
            file_size = task.dest_path.stat().st_size

            if file_size == 0:
                print(f"[ERROR] Empty file detected: {task.dest_path.name}")
                task.dest_path.unlink()  # remove the useless empty file
                return False

            # Media containers get structural checks.
            if task.dest_path.suffix.lower() in ['.mp4', '.mp3', '.wav', '.m4a']:
                if not self._validate_media_file(task.dest_path, file_size):
                    return False

            if task.expected_size and task.expected_size > 0:
                size_ratio = file_size / task.expected_size

                # Under 90% of the expected size is considered truncated.
                if size_ratio < 0.9:
                    print(f"[ERROR] Incomplete download: {task.dest_path.name} "
                          f"({file_size:,} bytes, expected {task.expected_size:,})")
                    return False

                # Over 110% is suspicious but not fatal.
                if size_ratio > 1.1:
                    print(f"[WARN] File larger than expected: {task.dest_path.name} "
                          f"({file_size:,} bytes, expected {task.expected_size:,})")

            if not self._validate_file_integrity(task.dest_path):
                return False

            print(f"[OK] Validated: {task.dest_path.name} ({file_size:,} bytes)")
            return True

        except Exception as e:
            print(f"[ERROR] Validation error for {task.dest_path.name}: {e}")
            return False

    def _validate_media_file(self, file_path: Path, file_size: int) -> bool:
        """Check media files for obvious corruption; deletes bad files."""
        try:
            # Real media files are never this small.
            if file_size < 1024:
                print(f"[ERROR] Media file too small: {file_path.name} ({file_size} bytes)")
                file_path.unlink()
                return False

            with open(file_path, 'rb') as f:
                header = f.read(16)

                # MP4 containers carry an 'ftyp' box near the start.
                if file_path.suffix.lower() == '.mp4':
                    if not (b'ftyp' in header or b'mdat' in header[:8]):
                        print(f"[ERROR] Invalid MP4 header: {file_path.name}")
                        file_path.unlink()
                        return False

                # A readable tail indicates the download finished.
                try:
                    f.seek(-min(1024, file_size), 2)
                    f.read(1024)
                except OSError:
                    print(f"[ERROR] Cannot read end of file: {file_path.name}")
                    file_path.unlink()
                    return False

            return True

        except Exception as e:
            print(f"[ERROR] Media validation failed for {file_path.name}: {e}")
            if file_path.exists():
                file_path.unlink()
            return False

    def _validate_file_integrity(self, file_path: Path) -> bool:
        """Ensure the whole file is readable; deletes it when it is not."""
        try:
            with open(file_path, 'rb') as f:
                while chunk := f.read(8192):
                    pass  # just reading to ensure the file is accessible
            return True

        except Exception as e:
            print(f"[ERROR] File integrity check failed for {file_path.name}: {e}")
            if file_path.exists():
                file_path.unlink()
            return False

    def close(self):
        """Release the HTTP session and shut down the worker pool."""
        self.session.close()
        self.executor.shutdown(wait=True)
accessible + return True + + except Exception as e: + print(f"โ File integrity check failed for {file_path.name}: {e}") + if file_path.exists(): + file_path.unlink() # Remove corrupted file + return False + + def close(self): + """Clean up resources.""" + self.session.close() + self.executor.shutdown(wait=True) \ No newline at end of file diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 0ab5711..5a1175f 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -11,6 +11,8 @@ from .config import Settings, load_env from .file_utils import filter_filename, unicode_decode +from .download_manager import DownloadManager, DownloadTask +from .progress_manager import print_banner, print_download_start_banner, print_completion_summary, ContentProcessor from tqdm import tqdm # Globals to mirror PHP behavior @@ -18,17 +20,27 @@ COURSE_CONTENTS: List[Dict[str, Any]] = [] SETTINGS: Optional[Settings] = None BASE_HOST: Optional[str] = None +DOWNLOAD_MANAGER: Optional[DownloadManager] = None +DOWNLOAD_TASKS: List[Dict[str, Any]] = [] # Collect all download tasks for parallel execution +CONTENT_PROCESSOR: Optional[ContentProcessor] = None USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' def init_settings(): - global SETTINGS + global SETTINGS, DOWNLOAD_MANAGER, CONTENT_PROCESSOR if SETTINGS is None: SETTINGS = Settings.from_env() + DOWNLOAD_MANAGER = DownloadManager(SETTINGS) + CONTENT_PROCESSOR = ContentProcessor() def http_get(url: str, headers: Optional[Dict[str, str]] = None, timeout: int = 60) -> str: + import time + import urllib.request + import urllib.error + import gzip + init_settings() if SETTINGS is None: raise RuntimeError("Settings not initialized") @@ -43,13 +55,28 @@ def http_get(url: str, headers: Optional[Dict[str, str]] = None, timeout: int = } if headers: request_headers.update(headers) - req = 
urllib.request.Request(url, headers=request_headers) - with urllib.request.urlopen(req, timeout=timeout) as resp: - data = resp.read() - encoding = resp.headers.get('Content-Encoding', '') - if 'gzip' in encoding: - data = gzip.decompress(data) - return data.decode('utf-8', errors='replace') + + # Retry logic for network reliability + for attempt in range(3): + try: + req = urllib.request.Request(url, headers=request_headers) + with urllib.request.urlopen(req, timeout=15) as resp: + data = resp.read() + encoding = resp.headers.get('Content-Encoding', '') + if 'gzip' in encoding: + data = gzip.decompress(data) + return data.decode('utf-8', errors='replace') + + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: + if attempt < 2: # Not last attempt + print(f" โ ๏ธ Network timeout, retrying... (attempt {attempt + 1}/3)") + time.sleep(2) + continue + else: + raise e + + # Should never reach here, but just in case + raise RuntimeError("All retry attempts failed") def download_file_redirect(url: str, file_name: Optional[str] = None): @@ -85,100 +112,613 @@ def download_file_redirect(url: str, file_name: Optional[str] = None): download_file_chunked(final_url, fname) -def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): - if Path(dst_name).exists(): - return - init_settings() - if SETTINGS is None: - raise RuntimeError("Settings not initialized") - request_headers = { - 'Accept-Encoding': 'identity', # streaming - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'cross-site', - 'x-requested-with': 'XMLHttpRequest', - 'x-thinkific-client-date': SETTINGS.client_date, - 'cookie': SETTINGS.cookie_data, - 'User-Agent': USER_AGENT, - } - req = urllib.request.Request(src_url, headers=request_headers) +def add_download_task(url: str, dest_path: Path, content_type: str = "file"): + """Add a download task to the global download queue.""" + global DOWNLOAD_TASKS + if DOWNLOAD_TASKS is None: + DOWNLOAD_TASKS = [] + + # Check if file exists 
and validate it + should_download = True + if dest_path.exists(): + file_size = dest_path.stat().st_size + + # Always re-download empty or suspiciously small files + if file_size == 0: + print(f"๐ Re-downloading empty file: {dest_path.name}") + dest_path.unlink() + should_download = True + elif content_type in ['video', 'audio'] and file_size < 1024: + print(f"๐ Re-downloading corrupt media file: {dest_path.name}") + dest_path.unlink() + should_download = True + elif _validate_existing_file(dest_path, content_type): + print(f"โ File already complete: {dest_path.name}") + should_download = False + else: + print(f"๐ Re-downloading invalid file: {dest_path.name}") + dest_path.unlink() + should_download = True + if should_download: + DOWNLOAD_TASKS.append({ + 'url': url, + 'dest_path': dest_path, + 'content_type': content_type + }) + + +def _validate_existing_file(file_path: Path, content_type: str) -> bool: + """Validate an existing file to determine if re-download is needed.""" try: - with urllib.request.urlopen(req) as resp: - # Get file size for progress bar - content_length = resp.headers.get('Content-Length') - total_size = int(content_length) if content_length else None - - chunk_bytes = chunk_mb * 1024 * 1024 - - # Create progress bar - with tqdm( - total=total_size, - unit='B', - unit_scale=True, - unit_divisor=1024, - desc=f"Downloading {Path(dst_name).name}", - bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' - ) as pbar: - - with open(dst_name, 'wb') as out: - start_time = time.time() - downloaded = 0 - - while True: - chunk = resp.read(chunk_bytes) - if not chunk: - break - out.write(chunk) - downloaded += len(chunk) - - # Update progress bar - pbar.update(len(chunk)) - - # Calculate and display speed every few chunks - if downloaded % (chunk_bytes * 5) == 0: # Update speed every 5MB - elapsed = time.time() - start_time - if elapsed > 0: - speed = downloaded / elapsed - pbar.set_postfix({'speed': f'{speed/1024/1024:.2f} 
MB/s'}) - - # Final speed calculation - elapsed = time.time() - start_time - if elapsed > 0: - speed = downloaded / elapsed - print(f"Download completed: {downloaded/1024/1024:.2f} MB in {elapsed:.2f}s (avg: {speed/1024/1024:.2f} MB/s)") + file_size = file_path.stat().st_size + + # Empty files are always invalid + if file_size == 0: + return False + + # Media files need special validation + if content_type in ['video', 'audio'] and file_path.suffix.lower() in ['.mp4', '.mp3', '.wav', '.m4a']: + return _validate_media_file_basic(file_path, file_size) + + # For other files, just check if they're readable + try: + with open(file_path, 'rb') as f: + f.read(1024) # Try to read first 1KB + return True + except: + return False + + except Exception: + return False + + +def _validate_media_file_basic(file_path: Path, file_size: int) -> bool: + """Basic validation for media files.""" + try: + # Too small files are invalid + if file_size < 1024: + return False + + # Check file headers + with open(file_path, 'rb') as f: + header = f.read(16) + + # MP4 validation + if file_path.suffix.lower() == '.mp4': + if not (b'ftyp' in header or b'mdat' in header[:8]): + return False + + # Check if we can read the end (complete file) + try: + f.seek(-min(512, file_size), 2) + f.read(512) + except: + return False + + return True + + except Exception: + return False + + +def execute_parallel_downloads() -> int: + """Execute all queued downloads in parallel and return success count.""" + global DOWNLOAD_TASKS, DOWNLOAD_MANAGER - except Exception as e: - print(f"Download failed for {dst_name}: {e}") - # Clean up partial file - if Path(dst_name).exists(): - Path(dst_name).unlink() + if not DOWNLOAD_TASKS or not DOWNLOAD_MANAGER: + return 0 + + from .download_manager import DownloadTask + + # Convert to DownloadTask objects + tasks = [] + for task_data in DOWNLOAD_TASKS: + task = DownloadTask( + url=task_data['url'], + dest_path=task_data['dest_path'] + ) + tasks.append(task) + + # Execute 
def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1):
    """Queue *src_url* for parallel download instead of fetching immediately.

    ``chunk_mb`` is kept for backward compatibility with the old streaming
    implementation and is ignored.
    """
    dst_path = Path(dst_name)
    if dst_path.exists():
        return
    add_download_task(src_url, dst_path, "file")


def init_course(data: Dict[str, Any]):
    """Build the course folder tree and run downloads in two phases.

    Phase 1 walks the course structure and queues every downloadable asset;
    phase 2 executes all queued downloads in parallel. Analysis progress is
    checkpointed to ``.thinkific_progress.json`` so interrupted runs resume.
    """
    global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS

    DOWNLOAD_TASKS = []

    course_name = filter_filename(data['course']['name'])
    prev_dir = Path.cwd()
    ROOT_PROJECT_DIR = prev_dir

    # All content lands under the configured output directory.
    output_dir = Path(SETTINGS.output_dir if SETTINGS else './downloads')
    output_dir.mkdir(exist_ok=True, parents=True)
    course_dir = output_dir / course_name
    course_dir.mkdir(exist_ok=True)
    os.chdir(course_dir)

    COURSE_CONTENTS = data['contents']

    # Resume support: previously analyzed chapters and queued tasks.
    cache_file = Path('.thinkific_progress.json')
    analyzed_chapters = set()
    saved_tasks = []

    if cache_file.exists():
        import json
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
            saved_tasks = cache_data.get('download_tasks', [])
            print(f"[RESUME] Found previous progress: {len(analyzed_chapters)} chapters "
                  f"analyzed, {len(saved_tasks)} tasks cached")
        except (OSError, ValueError):
            # Unreadable or corrupt cache: start the analysis from scratch.
            analyzed_chapters = set()
            saved_tasks = []

    # Derive base host from landing_page_url if available.
    landing = data['course'].get('landing_page_url')
    if landing:
        BASE_HOST = urlparse(landing).hostname

    # Phase 1: collect every download link without downloading anything.
    print("\nPhase 1: Analyzing course content and collecting download links...")

    if saved_tasks:
        print(f"[RESUME] Restoring {len(saved_tasks)} previously collected download tasks...")
        for task_data in saved_tasks:
            add_download_task(task_data['url'], Path(task_data['dest_path']),
                              task_data.get('content_type', 'video'))

    collect_all_download_tasks(data, analyzed_chapters, cache_file)

    # Phase 2: execute all queued downloads together.
    if DOWNLOAD_TASKS:
        from .progress_manager import print_download_start_banner, print_completion_summary

        print(f"\nPhase 2: Starting parallel download of {len(DOWNLOAD_TASKS)} files...")

        init_settings()
        parallel_workers = SETTINGS.concurrent_downloads if SETTINGS else 3
        print_download_start_banner(len(DOWNLOAD_TASKS), parallel_workers)

        if DOWNLOAD_MANAGER:
            import time
            start_time = time.time()
            success_count = execute_parallel_downloads()
            total_time = time.time() - start_time
            # execute_parallel_downloads always returns an int, so report
            # the summary unconditionally.
            print_completion_summary(success_count, len(DOWNLOAD_TASKS), total_time)
        else:
            print("[ERROR] Download manager not initialized")
    else:
        print("[INFO] No files found for download")

    os.chdir(prev_dir)


def collect_all_download_tasks(data: Dict[str, Any], analyzed_chapters=None, cache_file=None):
    """Walk every chapter, queue its download tasks, and checkpoint progress.

    After each chapter the analyzed-chapter set and the current task queue
    are written to *cache_file* so an interrupted analysis can resume.
    """
    global DOWNLOAD_TASKS

    if analyzed_chapters is None:
        analyzed_chapters = set()

    import json
    for i, chapter in enumerate(data.get('chapters', []), start=1):
        chapter_id = f"chapter_{i}"

        # Skip chapters already analyzed in a previous run.
        if chapter_id in analyzed_chapters:
            print(f"[SKIP] Chapter {i}: {chapter['name']} (already analyzed)")
            continue

        chap_folder_name = f"{i}. {filter_filename(chapter['name'])}"
        chapter_path = Path(chap_folder_name)
        chapter_path.mkdir(exist_ok=True)

        print(f"[SCAN] Analyzing Chapter {i}: {chapter['name']}")
        collect_chapter_tasks(chapter['content_ids'], chapter_path)

        # Mark as analyzed and persist the checkpoint.
        analyzed_chapters.add(chapter_id)
        if cache_file:
            try:
                task_data = [{
                    'url': task['url'],
                    'dest_path': str(task['dest_path']),
                    'content_type': task.get('content_type', 'video'),
                } for task in DOWNLOAD_TASKS]

                with open(cache_file, 'w', encoding='utf-8') as f:
                    json.dump({
                        'analyzed_chapters': list(analyzed_chapters),
                        'download_tasks': task_data,
                    }, f, indent=2)
            except OSError as e:
                # Analysis continues even if the checkpoint cannot be written.
                print(f"  [WARN] Could not save progress: {e}")
chapter_path / filter_filename(f"{index}. {match['name']} Text") + dc.mkdir(exist_ok=True) + + if not (dc / fname).exists(): + j = api_get(f"/api/course_player/v2/html_items/{match['contentable']}") + if j: + html_text = j.get('html_item', {}).get('html_text', '') + decoded = unicode_decode(html_text) + + # Collect MP3 audio files + mp3_matches = MP3_PATTERN.findall(decoded) + if mp3_matches: + for audio_url in set(mp3_matches): + audio_name = filter_filename(Path(urlparse(audio_url).path).name) + add_download_task(audio_url, dc / audio_name, "audio") + + # Save HTML content to file + fname = fname.replace(" ", "-") + (dc / fname).write_text(decoded, encoding='utf-8', errors='replace') + + # Collect video download tasks + videoproxy_matches = VIDEOPROXY_PATTERN.findall(decoded) + if videoproxy_matches: + for video_url in set(videoproxy_matches): + collect_video_task_videoproxy(video_url, filter_filename(match['name']), dc) + + wistia_matches = WISTIA_PATTERN.findall(decoded) + if wistia_matches: + for wistia_id in set(wistia_matches): + collect_video_task_wistia(wistia_id, filter_filename(match['name']), dc) + + index += 1 + continue + + # Multimedia (iframe) - Collect download tasks + if match.get('default_lesson_type_label') == 'Multimedia': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']} Multimedia") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/iframes/{match['contentable']}") + file_contents = '' + if j: + src_url = unicode_decode(j.get('iframe', {}).get('source_url') or '') + if re.search(r"(\.md|\.html|/)$", src_url): + try: + file_contents = http_get(src_url) + except Exception: + file_contents = src_url + else: + file_contents = src_url + + # Collect attached files + if j.get('download_files'): + for download_file in j['download_files']: + download_file_name = filter_filename(download_file.get('label') or 'file') + download_file_url = download_file.get('download_url') + if download_file_url: + add_download_task(download_file_url, dc / download_file_name, "file") + + # Save HTML file + fname = f"{match['name']}.html" + fname = re.sub(r"[^A-Za-z0-9\_\-\. \?]", '', fname) + fname = filter_filename(fname) + (dc / fname).write_text(file_contents, encoding='utf-8', errors='replace') + + index += 1 + continue + + # Lesson (videos + html + attachments) - Collect download tasks + if ctype == 'Lesson': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']} Lesson") + dc.mkdir(exist_ok=True) + vname = filter_filename(match['name']) + + j = api_get(f"/api/course_player/v2/lessons/{match['contentable']}") + if j: + # Collect video download tasks + videos = j.get('videos') or [] + if videos: + for video in videos: + storage = video.get('storage_location') + identifier = video.get('identifier') + if storage == 'wistia' and identifier: + collect_video_task_wistia(identifier, vname, dc) + elif storage == 'videoproxy' and identifier: + collect_video_task_videoproxy(f"https://platform.thinkific.com/videoproxy/v1/play/{identifier}", vname, dc) + else: + direct = video.get('url') + if direct: + add_download_task(direct, dc / f"{vname}.mp4", "video") + + # Save lesson HTML content + lesson_info = j.get('lesson', {}) + html_text = lesson_info.get('html_text') if isinstance(lesson_info, dict) else None + if html_text and html_text.strip(): + html_filename = f"{vname}.html" + (dc / html_filename).write_text(html_text, encoding='utf-8', errors='replace') + + # Collect attached files + for dlf in j.get('download_files', []) or []: + download_file_name = filter_filename(dlf.get('label') or 'file') + download_file_url = dlf.get('download_url') + if download_file_url: + add_download_task(download_file_url, dc / download_file_name, "file") + + index += 1 + continue + + # PDF - Collect download tasks + if ctype == 'Pdf': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/pdfs/{match['contentable']}") + if j: + pdf = j.get('pdf', {}) + pdf_url = pdf.get('url') + if pdf_url: + fname = filter_filename(Path(urlparse(pdf_url).path).name) + add_download_task(pdf_url, dc / fname, "pdf") + + index += 1 + continue + + # Download (shared files) - Collect download tasks + if ctype == 'Download': + dc = chapter_path / filter_filename(f"{index}. 
{match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/downloads/{match['contentable']}") + if j: + for dlf in j.get('download_files', []) or []: + label = filter_filename(dlf.get('label') or 'file') + url = dlf.get('download_url') + if url: + add_download_task(url, dc / label, "file") + + index += 1 + continue + + # Audio - Collect download tasks + if ctype == 'Audio': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/audio/{match['contentable']}") + if j: + audio = j.get('audio', {}) + audio_url = audio.get('url') + if audio_url: + fname = filter_filename(Path(urlparse(audio_url).path).name) + add_download_task(audio_url, dc / fname, "audio") + + index += 1 + continue + + # Presentation - Collect download tasks + if ctype == 'Presentation': + dc = chapter_path / filter_filename(f"{index}. {match['name']}") + dc.mkdir(exist_ok=True) + + j = api_get(f"/api/course_player/v2/presentations/{match['contentable']}") + if j: + pres = j.get('presentation', {}) + pdf_url = pres.get('source_file_url') + pdf_name = filter_filename(pres.get('source_file_name') or 'slides.pdf') + if pdf_url: + add_download_task(pdf_url, dc / pdf_name, "presentation") + + # Handle presentation merging - collect slide assets + merge_flag = SETTINGS.ffmpeg_presentation_merge if SETTINGS else False + if merge_flag: + from shutil import which + if which('ffmpeg'): + items = j.get('presentation_items') or [] + for it in items: + pos = it.get('position') + img_url = it.get('image_file_url') + aud_url = it.get('audio_file_url') + if img_url: + img_url = 'https:' + img_url if img_url.startswith('//') else img_url + img_name = filter_filename(f"{pos}{it.get('image_file_name','slide.png')}") + add_download_task(img_url, dc / img_name, "image") + if aud_url: + aud_url = 'https:' + aud_url if aud_url.startswith('//') else aud_url + aud_name = 
filter_filename(f"{pos}{it.get('audio_file_name','audio.m4a')}") + add_download_task(aud_url, dc / aud_name, "audio") + + index += 1 + continue + + # Quiz - Handle separately (complex logic) + if ctype == 'Quiz': + dc = chapter_path / filter_filename(f"{index}. {match['name']} Quiz") + dc.mkdir(exist_ok=True) + + fname = filter_filename(f"{match['name']} Answers.html") + qname = filter_filename(f"{match['name']} Questions.html") + + result = api_get(f"/api/course_player/v2/quizzes/{match['contentable']}") + if result: + file_contents_with_answers = "