#!/usr/bin/env python3
"""Generate a JSON catalogue of remote interactives hosted on iwant2study.org.

This script crawls the Apache-style directory listings under
    https://iwant2study.org/lookangejss/slsMasterPrompt/

and discovers HTML interactives to expose in the same catalogue format used by
`generate_prompt_catalogue.py`, but with **absolute URLs** for `indexPath`.

Output:
    catalogue_remote.json

Usage (from this folder):
    python generate_remote_catalogue.py

Notes / Design choices
----------------------
- To avoid overloading the server, the script:
  * Uses only the Python standard library (no extra dependencies).
  * Has a small delay between directory requests.
  * Limits the total number of recorded items via MAX_ITEMS.
- By default, it starts at the root `lookangejss/slsMasterPrompt/` directory and recurses.
  You can narrow the crawl to selected subtrees by editing START_PATHS below,
  e.g. START_PATHS = ["promptLibrary/"] or ["physics/", "math/"]
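- For now, we only record the HTML entry itself (indexPath and basic metadata).
  We **do not** probe for remote prompt.txt / prompt_image files, to keep the
  number of HTTP calls reasonable. The one exception is ZIP files: top-level
  interactive folders are recorded with an *assumed* ZIP URL
  (`<folder>.zip` under the root) whose existence is not verified.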
"""

from __future__ import annotations

import json
import time
from dataclasses import asdict, dataclass
from html.parser import HTMLParser
from pathlib import PurePosixPath
from typing import List, Set
from urllib.error import HTTPError, URLError
from urllib.parse import unquote, urljoin
from urllib.request import urlopen

# Root of the remote tree being catalogued. Every relative path handled by the
# crawler is resolved against this URL, so it must end with a slash.
BASE_URL = "https://iwant2study.org/lookangejss/slsMasterPrompt/"
# Relative paths (from BASE_URL) to start crawling. Empty string means root.
# Change to e.g. ["promptLibrary/"] to restrict the crawl to a subtree.
START_PATHS = [""]  # you can change to ["promptLibrary/"] to restrict scope

# Safety limits
# Pause, in seconds, that crawl_directory() sleeps to stay polite to the server.
REQUEST_DELAY_SEC = 0.15  # polite delay between directory requests
# The crawl stops once this many catalogue items have been recorded.
MAX_ITEMS = 2000          # hard cap on total interactives recorded
# Directories nested deeper than this (counted in path segments below
# BASE_URL) are skipped rather than fetched.
MAX_DEPTH = 10            # max directory depth from BASE_URL


@dataclass
class CatalogueItem:
    """One catalogue entry describing a remote interactive.

    The field names (camelCase) are the JSON keys written to the catalogue,
    so they must not be renamed. Only the first five fields are required;
    the remaining ones default to "not available".
    """

    # Unique identifier of the entry (the crawler prefixes it with "remote:").
    id: str
    # Human-readable name shown for the interactive.
    title: str
    # Name of the folder (or HTML file stem) the entry was derived from.
    folder: str
    # Path of the containing folder, relative to the crawl root.
    folderPath: str
    # Absolute URL of the HTML entry point.
    indexPath: str
    # Prompt text metadata.
    hasPrompt: bool = False
    promptText: str = ""
    # Downloadable ZIP metadata.
    hasZip: bool = False
    zipPath: str | None = None
    # Prompt image metadata.
    hasPromptImage: bool = False
    promptImagePath: str | None = None
    promptImageExt: str | None = None
    # Knowledge-base document metadata.
    hasKnowledgeBase: bool = False
    knowledgeBasePath: str | None = None
    knowledgeBaseExt: str | None = None

    def to_dict(self) -> dict:
        """Return the entry as a plain dict ready for ``json.dump``.

        Keys are the field names in declaration order. The dict is derived
        from the dataclass fields, so a newly added field can never be
        forgotten here.
        """
        return asdict(self)


class LinkParser(HTMLParser):
    """Collect the ``href`` of every anchor tag fed to the parser.

    After ``feed()`` the raw, unresolved href values are available in
    ``self.links`` in document order. Anchors without an href, or with an
    empty one, are ignored.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: List[str] = []

    def handle_starttag(self, tag: str, attrs):  # type: ignore[override]
        """Record the first non-empty href attribute of an ``<a>`` tag."""
        if tag.lower() == "a":
            # Only the first href attribute counts, even if it has no value.
            target = next(
                (value for name, value in attrs if name.lower() == "href"),
                None,
            )
            if target:
                self.links.append(target)


def is_apache_index_page(html: str) -> bool:
    """Return True when *html* looks like an Apache directory listing.

    The check is a heuristic: the page must contain both the "Index of /"
    heading text and a column-sort link of the form `?C=N;O=...`. Ordinary
    content pages are not expected to contain the latter.
    """
    required_markers = ("Index of /", "?C=N;O=")
    return all(marker in html for marker in required_markers)


def fetch_directory_html(url: str, timeout: float = 30.0) -> str | None:
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole crawl.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are dropped),
        or None when the request fails or the server reports an error
        status. Failures are reported on stdout as ``[WARN]`` lines.
    """
    try:
        with urlopen(url, timeout=timeout) as resp:
            # Some servers may not set a status attribute, fall back to getcode().
            status = getattr(resp, "status", None) or resp.getcode()
            # Non-HTTP responses (e.g. file://) carry no status at all.
            if status is not None and status >= 400:
                print(f"[WARN] HTTP {status} on {url}")
                return None
            content_bytes = resp.read()
    except (HTTPError, URLError, OSError) as e:
        # OSError also covers socket timeouts and connection resets raised
        # while reading the body, which are not wrapped in URLError.
        print(f"[WARN] Error fetching {url}: {e}")
        return None
    return content_bytes.decode("utf-8", errors="ignore")


def is_subdirectory_href(href: str) -> bool:
    """Return True if *href* points to a subdirectory below the current one.

    Accepted hrefs are relative paths ending in "/" (e.g. "physics/").
    Everything that could lead the crawl out of the current directory is
    rejected: server-absolute paths, URLs with a scheme, paths containing a
    ".." segment, sort/query links and fragment links.
    """
    href = href.strip()
    if not href:
        return False
    # Server-absolute paths escape the BASE_URL subtree; "?..." hrefs are the
    # column-sorting links of a directory listing (e.g. ?C=N;O=D).
    if href.startswith(("/", "?")):
        return False
    if "#" in href:
        return False
    segments = href.split("/")
    # A colon in the first segment marks a URL scheme ("https://host/"),
    # i.e. an absolute URL rather than a relative path.
    if ":" in segments[0]:
        return False
    # Parent references ("../", "a/../../b/") walk back up the tree.
    if ".." in segments:
        return False
    return href.endswith("/")


def is_html_file_href(href: str) -> bool:
    """Return True if *href* is a relative link to an ``.html`` file.

    The extension check is case-insensitive. Links that would resolve
    outside the current directory tree are rejected: server-absolute paths,
    URLs with a scheme and paths containing a ".." segment. Sort/query
    links and directory links (trailing "/") are rejected as well.
    """
    href = href.strip()
    if not href:
        return False
    # Server-absolute paths escape the BASE_URL subtree; "?..." hrefs are the
    # column-sorting links of a directory listing.
    if href.startswith(("/", "?")):
        return False
    if href.endswith("/"):
        return False
    segments = href.split("/")
    # A colon in the first segment marks a URL scheme ("https://host/x.html"),
    # i.e. an absolute URL rather than a relative path.
    if ":" in segments[0]:
        return False
    # Parent references ("../x.html") point outside the current directory.
    if ".." in segments:
        return False
    return href.lower().endswith(".html")


def depth_from_root(rel_path: str) -> int:
    """Return how many path segments *rel_path* lies below the crawl root.

    Leading and trailing slashes are ignored; an empty path (the root
    itself) has depth 0, "a/" has depth 1, "a/b/" has depth 2, and so on.
    """
    trimmed = rel_path.strip("/")
    return len(trimmed.split("/")) if trimmed else 0


def crawl_directory(rel_path: str, items: List[CatalogueItem], visited: Set[str]) -> None:
    """Recursively crawl a directory under BASE_URL and record interactives.

    Args:
        rel_path: Path relative to BASE_URL, e.g.
            "02_newtonianmechanics_7energyworkpower/". Empty string = root.
        items: Output list; discovered entries are appended in place.
        visited: Absolute URLs already handled; updated in place so that no
            directory is fetched twice.

    The crawl stops early once MAX_ITEMS entries exist, skips directories
    deeper than MAX_DEPTH, and never leaves the BASE_URL subtree. Progress is
    reported on stdout.
    """
    if len(items) >= MAX_ITEMS:
        return

    # Normalise rel_path to always end with '/'
    if rel_path and not rel_path.endswith("/"):
        rel_path = rel_path + "/"

    url = urljoin(BASE_URL, rel_path)
    # urljoin resolves ".." segments, so a link that climbs out of the tree
    # shows up here as a URL that no longer starts with BASE_URL.
    if not url.startswith(BASE_URL):
        print(f"[INFO] Skipping {url} (outside {BASE_URL})")
        return
    if url in visited:
        return
    visited.add(url)

    if depth_from_root(rel_path) > MAX_DEPTH:
        print(f"[INFO] Skipping {url} (depth>{MAX_DEPTH})")
        return

    print(f"[INFO] Crawling directory: {url}")
    html = fetch_directory_html(url)
    # Be polite to the server: pause after *every* request (failed ones
    # included) so that consecutive fetches are always spaced out.
    time.sleep(REQUEST_DELAY_SEC)
    if html is None:
        return

    # Special handling for slsMasterPrompt: each *top-level* directory is an
    # interactive whose content is served directly at the directory URL
    # (typically via an underlying index.html). These pages are not Apache
    # index listings, so they would otherwise produce no .html links.
    is_entry_point = (
        BASE_URL.endswith("slsMasterPrompt/")
        and depth_from_root(rel_path) == 1
        and not is_apache_index_page(html)
    )
    if is_entry_point:
        # Treat this directory URL itself as the interactive entry point.
        p = PurePosixPath(rel_path.strip("/"))
        folder_name = p.name
        # Hrefs from a listing are percent-encoded; decode for display only
        # and turn underscores into spaces.
        title = unquote(folder_name).replace("_", " ")

        # Avoid duplicates
        if all(it.indexPath != url for it in items):
            # Assume a matching ZIP (same name + .zip at the slsMasterPrompt
            # root). Its existence is deliberately not verified, to keep
            # the number of requests minimal.
            zip_url = urljoin(BASE_URL, f"{folder_name}.zip")

            items.append(
                CatalogueItem(
                    id="remote:" + str(p),
                    title=title,
                    folder=folder_name,
                    folderPath=str(p) + "/",
                    indexPath=url,
                    hasZip=True,
                    zipPath=zip_url,
                )
            )
            print(f"  [ADD] {title} -> {url}")
            if len(items) >= MAX_ITEMS:
                print(f"[INFO] Reached MAX_ITEMS={MAX_ITEMS}, stopping crawl.")
                return

    parser = LinkParser()
    parser.feed(html)
    links = [href.strip() for href in parser.links]

    # First process HTML files in this directory. An entry-point page is a
    # content page, not a listing: its directory URL is already recorded as
    # the entry, so the .html links it contains are not separate interactives.
    if not is_entry_point:
        for href in links:
            if not is_html_file_href(href):
                continue
            add_html_item(rel_path + href, items)
            if len(items) >= MAX_ITEMS:
                print(f"[INFO] Reached MAX_ITEMS={MAX_ITEMS}, stopping crawl.")
                return

    # Then recurse into subdirectories (also for entry-point pages, which
    # may link to nested folders).
    for href in links:
        if not is_subdirectory_href(href):
            continue
        crawl_directory(rel_path + href, items, visited)
        if len(items) >= MAX_ITEMS:
            return


def add_html_item(rel_index: str, items: List[CatalogueItem]) -> None:
    """Append a CatalogueItem for a discovered HTML file to *items*.

    Args:
        rel_index: Path of the HTML file *relative to BASE_URL*, e.g.
            "02_newtonianmechanics_7energyworkpower/ejss_model_projectileprimary/index.html"
        items: Output list, modified in place.

    Nothing is added when the path does not end in ``.html`` or when an
    entry with the same absolute URL is already present. For ``index.html``
    the entry is named after its parent folder; any other file is named
    after its own stem.
    """
    p = PurePosixPath(rel_index)
    if p.suffix.lower() != ".html":
        return

    # Avoid duplicates
    full_url = urljoin(BASE_URL, rel_index)
    if any(it.indexPath == full_url for it in items):
        return

    if p.name == "index.html":
        # e.g. ".../ejss_model_projectileprimary/index.html" is named after
        # the folder; a root-level index.html has no folder, hence "index".
        folder_name = p.parent.name or "index"
    else:
        folder_name = p.stem
    # Hrefs from a listing are percent-encoded ("My%20Sim"); decode for the
    # human-readable title only, leaving paths and ids exactly as crawled.
    title = unquote(folder_name).replace("_", " ")

    folder_path = str(p.parent)
    if folder_path and not folder_path.endswith("/"):
        folder_path += "/"

    # Prefix id with "remote:" to avoid collisions with local catalogue ids
    item_id = "remote:" + str(p.with_suffix(""))

    items.append(
        CatalogueItem(
            id=item_id,
            title=title,
            folder=folder_name,
            folderPath=folder_path,
            indexPath=full_url,
        )
    )
    print(f"  [ADD] {title} -> {full_url}")


def main() -> None:
    """Crawl every path in START_PATHS and write `catalogue_remote.json`.

    The collected entries are ordered case-insensitively by title and
    written as an indented, non-ASCII-escaped JSON array into the current
    working directory. A one-line summary is printed at the end.
    """
    collected: List[CatalogueItem] = []
    seen_urls: Set[str] = set()

    for start_path in START_PATHS:
        crawl_directory(start_path, collected, seen_urls)

    # Sort by title like the local generator does
    collected.sort(key=lambda entry: entry.title.lower())
    payload = [entry.to_dict() for entry in collected]

    output_path = "catalogue_remote.json"
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(payload, indent=2, ensure_ascii=False))

    print(f"\nWrote {len(collected)} remote items to {output_path}")


# Run the crawl only when executed as a script, so importing this module
# does not trigger any network access or file writes.
if __name__ == "__main__":
    main()
