# natureinpots_community/plugins/media/tasks.py

# File: plugins/media/tasks.py

import codecs
import os
import shutil
import zipfile

from celery.schedules import crontab
from flask import current_app
from PIL import Image, UnidentifiedImageError
from werkzeug.utils import secure_filename

from app import db
from app.celery_app import celery
from plugins.media.models import Media, ZipJob

# ─── Constants ────────────────────────────────────────────────────────────────
# Extensions process_zip treats as images; such members go through
# validate_image().
IMAGE_EXTS    = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
# Non-image extensions process_zip accepts: '.pdf' gets a header check, the
# others have their first bytes decoded as UTF-8.
DOC_EXTS      = {'.pdf', '.txt', '.csv'}
# Maximum number of entries a ZIP may list before process_zip rejects it.
MAX_ZIP_FILES = 1000
# Largest image area (width * height, in pixels) validate_image() accepts.
MAX_PIXELS    = 8000 * 8000


def validate_image(path):
    """
    Check that a file is an image Pillow can parse and is not oversized.

    Args:
        path: Filesystem path of the file to inspect.

    Returns:
        True when Pillow identifies and verifies the file and its
        width * height does not exceed MAX_PIXELS; False otherwise.
    """
    try:
        # A single context-managed open: the size is available straight after
        # open(), so no second (previously unclosed) handle is needed.
        with Image.open(path) as img:
            width, height = img.size
            # Reject oversized images before spending time on verify().
            if width * height > MAX_PIXELS:
                return False
            # verify() must be the last operation on this image object.
            img.verify()
    except (
        UnidentifiedImageError,
        OSError,
        # Pillow format plugins also report corrupt data with these types.
        SyntaxError,
        ValueError,
        # Raised by Image.open() for images far beyond Pillow's pixel limit.
        Image.DecompressionBombError,
    ):
        return False
    return True


@celery.task(
    bind=True,
    name='plugins.media.tasks.process_zip',
    queue='media'
)
def process_zip(self, job_id, zip_path):
    """
    Unpack and validate a user-uploaded ZIP batch.

    Each member is extracted into a scratch directory named
    ``<zip_path>_contents`` and checked according to its extension: images
    via validate_image(), PDFs by their ``%PDF-`` header, and the remaining
    document types by decoding their first bytes as UTF-8. The first failing
    member aborts the run. The outcome is stored on the ZipJob record
    (status 'done', or 'failed' with the error text) and the scratch
    directory is removed in every case.

    Args:
        job_id: Identifier passed to ``ZipJob.query.get`` to load the job.
        zip_path: Filesystem path of the uploaded archive.

    Raises:
        ValueError: If no ZipJob exists for ``job_id``.
    """
    job = ZipJob.query.get(job_id)
    if job is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'ZipJob {job_id} not found')
    job.status = 'processing'
    db.session.commit()

    extract_dir = f"{zip_path}_contents"
    allowed_exts = IMAGE_EXTS | DOC_EXTS  # built once, not per member
    text_sample_size = 1024
    try:
        with zipfile.ZipFile(zip_path) as zf:
            names = zf.namelist()
            if len(names) > MAX_ZIP_FILES:
                raise ValueError('ZIP contains too many files.')

            os.makedirs(extract_dir, exist_ok=True)
            for member in names:
                # Only names that secure_filename() leaves untouched are
                # accepted, which rules out path separators and "..".
                safe = secure_filename(member)
                if safe != member:
                    raise ValueError(f'Illegal filename {member}')

                _, ext = os.path.splitext(safe.lower())
                if ext not in allowed_exts:
                    raise ValueError(f'Unsupported type {ext}')

                target = os.path.join(extract_dir, safe)
                # Stream in chunks so a large member is never held fully in
                # memory.
                # NOTE(review): there is still no cap on uncompressed size;
                # consider checking ZipInfo.file_size against a limit.
                with zf.open(member) as src, open(target, 'wb') as dst:
                    shutil.copyfileobj(src, dst)

                if ext in IMAGE_EXTS:
                    if not validate_image(target):
                        raise ValueError(f'Bad image: {member}')
                elif ext == '.pdf':
                    with open(target, 'rb') as f:
                        header = f.read(5)
                    if header != b'%PDF-':
                        raise ValueError(f'Bad PDF: {member}')
                else:
                    with open(target, 'rb') as f:
                        sample = f.read(text_sample_size)
                    # A fixed-size slice can end in the middle of a
                    # multi-byte character; only treat a dangling sequence
                    # as an error when the sample is the whole file.
                    decoder = codecs.getincrementaldecoder('utf-8')()
                    decoder.decode(
                        sample, final=len(sample) < text_sample_size
                    )

        job.status = 'done'
    except Exception as e:
        job.status = 'failed'
        job.error  = str(e)
    finally:
        try:
            db.session.commit()
        finally:
            # Remove the scratch directory even if the commit itself fails.
            if os.path.isdir(extract_dir):
                shutil.rmtree(extract_dir)


@celery.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    """
    Register the daily orphan-prune job with the Celery scheduler.

    Connected to ``on_after_configure``; ``sender`` is the object the
    periodic task is added to. The job is named 'media_prune', goes to the
    'media' queue, and is scheduled with a crontab of hour 2, minute 0.
    """
    daily_at_two = crontab(hour=2, minute=0)
    prune_signature = prune_orphans.s()
    sender.add_periodic_task(
        daily_at_two,
        prune_signature,
        name='media_prune',
        queue='media',
    )


@celery.task(
    name='plugins.media.tasks.prune_orphans',
    queue='media'
)
def prune_orphans():
    """
    Move the files of orphaned Media records to /static/orphaned/ and mark
    the records as orphaned.

    A record is a candidate when its status is 'active' and its plant_id,
    growlog_id and related_id are all NULL. Each candidate is handled and
    committed on its own, so a failure part-way through never leaves
    already-moved files with uncommitted database changes.

    Records without a file_url, with a missing file, or whose file cannot be
    moved are logged and skipped. If the database update fails after a move,
    the file is moved back (best effort) and the error is re-raised.
    """
    orphan_dir = os.path.join(current_app.root_path, 'static', 'orphaned')
    os.makedirs(orphan_dir, exist_ok=True)

    candidates = Media.query.filter(
        Media.status == 'active',
        Media.plant_id.is_(None),
        Media.growlog_id.is_(None),
        Media.related_id.is_(None)
    ).all()

    for m in candidates:
        # Read the id up front: the instance may be expired by a commit or
        # rollback further down.
        media_id = m.id

        if not m.file_url:
            current_app.logger.warning(
                f"Orphan prune: media #{media_id} has no file_url"
            )
            continue

        src_rel = m.file_url.lstrip('/')
        src_abs = os.path.join(current_app.root_path, src_rel)
        if not os.path.isfile(src_abs):
            current_app.logger.warning(f"Orphan prune: file not found {src_abs}")
            continue

        filename = os.path.basename(src_abs)
        dest_abs = os.path.join(orphan_dir, filename)
        # Files from different directories can share a basename; pick a
        # fresh name rather than overwriting an earlier orphan.
        stem, ext = os.path.splitext(filename)
        attempt = 0
        while os.path.exists(dest_abs):
            attempt += 1
            filename = f"{stem}_{media_id}_{attempt}{ext}"
            dest_abs = os.path.join(orphan_dir, filename)

        try:
            shutil.move(src_abs, dest_abs)
        except OSError:
            # One unmovable file must not block the remaining candidates.
            current_app.logger.exception(
                f"Orphan prune: could not move {src_abs}"
            )
            continue

        new_url = f"/static/orphaned/{filename}"
        try:
            m.mark_orphaned(new_url)
            db.session.commit()
        except Exception:
            db.session.rollback()
            # Keep disk and database in agreement: undo the move.
            try:
                shutil.move(dest_abs, src_abs)
            except OSError:
                current_app.logger.exception(
                    f"Orphan prune: could not restore {dest_abs} to {src_abs}"
                )
            raise

        current_app.logger.info(
            f"Orphaned media #{media_id}: moved {src_rel} → {new_url}"
        )


def init_media_tasks(celery_app):
    """
    No-op initialisation hook for the media plugin.

    Exists so the JSON-driven loader's tasks_init step has something to call;
    it only logs that it was invoked. Celery scheduling is handled via
    on_after_configure.
    """
    message = "[Media] init_media_tasks called (no‐op)"
    log = celery_app.logger
    log.info(message)