﻿import re, os, hashlib
from pathlib import Path
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

# Root of the WebScrapBook capture this script post-processes.
base = Path(r"C:\Users\poc\Downloads\WebScrapBook\data\20260429163450072")
# Saved article pages live in "news"; downloaded assets are mirrored below
# its "_offline" sub-folder.
news_dir = base.joinpath("news")
offline_root = news_dir.joinpath("_offline")
# Create the mirror folder (and any missing parents) up front.
os.makedirs(offline_root, exist_ok=True)

# Saved page file name (inside news_dir) -> URL the page was captured from.
# The URL is the base against which the page's relative links are resolved.
source_map = dict(
    [
        (
            "news-1.html",
            "https://blog.blockstream.com/blockstream-research-brings-libsecp256k1-zkp-back-up-to-speed/",
        ),
        (
            "news-2.html",
            "https://blog.blockstream.com/on-chain-swaps-and-lightning-come-to-the-blockstream-desktop-app/",
        ),
        (
            "news-3.html",
            "https://blog.blockstream.com/blockstream-research-demonstrates-quantum-resistant-transaction-signing-on-liquid-using-simplicity-smart-contracts/",
        ),
        (
            "news-4.html",
            "https://blog.blockstream.com/the-risks-of-expressive-smart-contracts-lessons-from-the-latest-ethereum-hack/",
        ),
        (
            "news-5.html",
            "https://blog.blockstream.com/confidentiality-made-affordable-elip-200-unlocks-up-to-90-lower-fees-on-liquid/",
        ),
    ]
)

# Cache of download attempts: URL -> (local path, content type) on success,
# None when the fetch failed.
visited = dict()

# Captures the quoted value of any src=... or href=... attribute
# (case-insensitive attribute name).
attr_re = re.compile(r'(?:src|href)=["\']([^"\']+)["\']', flags=re.IGNORECASE)
# Captures the raw argument of a CSS url(...) reference, quotes included.
css_url_re = re.compile(r'url\(([^)]+)\)')

# Headers sent with every request.
ua = {"User-Agent": "Mozilla/5.0"}


def local_path_for(url: str, root=None) -> Path:
    """Map *url* to the file path used for its local copy.

    The result is ``root / <host> / <url path>``, with these adjustments:

    * an empty path becomes ``index`` and a path ending in ``/`` gets
      ``index.html`` appended;
    * a final component without an extension gets ``.bin`` appended;
    * a query string is folded into the file name as ``_q`` plus the first
      8 hex digits of its MD5 (used only to keep names distinct);
    * empty, ``.`` and ``..`` components are dropped, so the result can
      never climb out of *root* (``..`` is discarded, not resolved);
    * characters that are not allowed in Windows file names
      (``< > : " | ? * \\``) are replaced by ``_`` in the host and in every
      path component.

    Args:
        url: Absolute URL of the resource.
        root: Directory under which the mirror is built. Defaults to the
            module-level ``offline_root``.

    Returns:
        The destination path. Nothing is created on disk.
    """
    if root is None:
        root = offline_root

    p = urlparse(url)
    path = p.path or "/index"
    if path.endswith("/"):
        path += "index.html"

    unsafe = re.compile(r'[<>:"|?*\\]')
    # URLs come from remote HTML/CSS, so never trust their dot segments:
    # joining ".." onto root would let a page write outside the mirror.
    segments = [
        unsafe.sub("_", seg)
        for seg in path.split("/")
        if seg not in ("", ".", "..")
    ]
    if not segments:
        segments = ["index"]
    if not Path(segments[-1]).suffix:
        segments[-1] += ".bin"

    # The host may carry a port ("host:8080"); ":" is illegal on Windows.
    host = unsafe.sub("_", p.netloc)
    top = [] if host in ("", ".", "..") else [host]
    rel = Path(*top, *segments)

    if p.query:
        qh = hashlib.md5(p.query.encode("utf-8", "ignore")).hexdigest()[:8]
        rel = rel.with_name(rel.stem + "_q" + qh + rel.suffix)
    return root / rel


def _rewrite_css_urls(css_file: Path, css_url: str) -> None:
    """Download the targets of ``url(...)`` in *css_file* and relink them.

    Each reference is resolved against *css_url*, fetched through
    ``download`` and replaced by ``url('<path relative to the CSS file>')``.
    References that are empty, start with ``data:`` or ``#``, cannot be
    resolved, or fail to download are left exactly as they were. The file
    is rewritten only when at least one reference changed.
    """
    txt = css_file.read_text(encoding="utf-8", errors="ignore")

    def _localize(m):
        raw = m.group(1).strip().strip('"\'')
        if not raw or raw.startswith(("data:", "#")):
            return m.group(0)
        try:
            dl = download(urljoin(css_url, raw))
        except ValueError:
            # A malformed reference must not cost us the other rewrites.
            return m.group(0)
        if not dl:
            return m.group(0)
        local_file, _ = dl
        rel = os.path.relpath(local_file, css_file.parent).replace('\\', '/')
        return f"url('{rel}')"

    changed = css_url_re.sub(_localize, txt)
    if changed != txt:
        css_file.write_text(changed, encoding="utf-8")


def download(url: str):
    """Fetch *url* once, store it under the offline mirror and return its location.

    Results are memoised in the module-level ``visited`` dict, keyed by the
    URL without its ``#fragment`` (the fragment affects neither the request
    nor the destination path). When the response is CSS (by Content-Type or
    by ``.css`` suffix), the resources it references through ``url(...)``
    are downloaded recursively and the stylesheet is rewritten to point at
    the local copies.

    Args:
        url: Absolute URL. Only ``http`` and ``https`` are fetched.

    Returns:
        ``(local_path, content_type)`` on success, where ``content_type`` is
        the lower-cased Content-Type header (``""`` if absent); ``None`` if
        the URL has another scheme, the fetch failed, or the file could not
        be written. Failures are reported on stderr and never raised.
    """
    import sys

    key = url.partition("#")[0]
    if key in visited:
        return visited[key]

    # urlopen also serves file:, ftp: and data: URLs; a remote stylesheet
    # must not be able to make us read local files through url(file:///...).
    try:
        scheme = urlparse(key).scheme.lower()
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        visited[key] = None
        return None

    try:
        req = Request(key, headers=ua)
        with urlopen(req, timeout=20) as r:
            data = r.read()
            ctype = (r.headers.get("Content-Type") or "").lower()
    except Exception as exc:
        # Deliberately broad: network failures surface as URLError/OSError,
        # http.client exceptions or ValueError, and one dead link must not
        # stop the run.
        print(f"warning: could not fetch {key}: {exc}", file=sys.stderr)
        visited[key] = None
        return None

    try:
        out = local_path_for(key)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_bytes(data)
    except OSError as exc:
        print(f"warning: could not store {key}: {exc}", file=sys.stderr)
        visited[key] = None
        return None

    # Record the result before recursing so that stylesheets which reference
    # each other (or themselves) terminate.
    visited[key] = (out, ctype)

    # rewrite nested css urls
    if "text/css" in ctype or out.suffix.lower() == ".css":
        try:
            _rewrite_css_urls(out, key)
        except (OSError, ValueError) as exc:
            # Best effort: the raw stylesheet is already on disk.
            print(f"warning: could not rewrite {out}: {exc}", file=sys.stderr)

    return visited[key]


def _localize_links(html: str, src_url: str) -> str:
    """Return *html* with downloadable ``src``/``href`` values made local.

    Every attribute matched by ``attr_re`` is resolved against *src_url*,
    fetched through ``download`` and replaced by the path of the local copy
    relative to ``news_dir`` (forward slashes, original ``#fragment`` kept).
    Empty links, ``#`` anchors, ``javascript:``/``mailto:``/``tel:`` links,
    non-http targets, malformed URLs and failed downloads are left as-is.

    Only the attribute value that matched is rewritten; other occurrences
    of the same text elsewhere in the document are not touched. (A global
    ``str.replace`` would turn ``href="/"`` into a rewrite of every slash
    in the page.)
    """

    def _swap(m):
        link = m.group(1).strip()
        if not link or link.startswith(("#", "javascript:", "mailto:", "tel:")):
            return m.group(0)
        try:
            abs_url = urljoin(src_url, link)
            fragment = urlparse(abs_url).fragment
        except ValueError:
            return m.group(0)
        if not abs_url.startswith("http"):
            return m.group(0)
        dl = download(abs_url)
        if not dl:
            return m.group(0)
        local_file, _ = dl
        rel = os.path.relpath(local_file, news_dir).replace('\\', '/')
        if fragment:
            rel += "#" + fragment
        # Splice the new value into the matched attribute, keeping the
        # attribute name and the original quotes.
        whole = m.group(0)
        head = whole[: m.start(1) - m.start()]
        tail = whole[m.end(1) - m.start():]
        return head + rel + tail

    return attr_re.sub(_swap, html)


# Rewrite each saved page in place so that its assets load from the mirror.
for name, src_url in source_map.items():
    fp = news_dir / name
    html = fp.read_text(encoding="utf-8", errors="ignore")
    new_html = _localize_links(html, src_url)
    fp.write_text(new_html, encoding="utf-8")

print(f"done files={len(source_map)} downloaded={sum(1 for v in visited.values() if v)}")
