import re, os, hashlib
from pathlib import Path
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

# Directory holding the saved news pages that this script rewrites in place.
news_dir = Path(r"C:\Users\poc\Downloads\WebScrapBook\data\20260429163450072\news")

# Downloaded assets are mirrored below this sub-folder; create it up front.
offline_root = news_dir.joinpath("_offline")
os.makedirs(offline_root, exist_ok=True)

# Saved page file name -> original article URL. The URL is the base against
# which relative references found in that page are resolved.
source_map = {
    f"news-{index}.html": article_url
    for index, article_url in enumerate(
        (
            "https://blog.blockstream.com/blockstream-research-brings-libsecp256k1-zkp-back-up-to-speed/",
            "https://blog.blockstream.com/on-chain-swaps-and-lightning-come-to-the-blockstream-desktop-app/",
            "https://blog.blockstream.com/blockstream-research-demonstrates-quantum-resistant-transaction-signing-on-liquid-using-simplicity-smart-contracts/",
            "https://blog.blockstream.com/the-risks-of-expressive-smart-contracts-lessons-from-the-latest-ethereum-hack/",
            "https://blog.blockstream.com/confidentiality-made-affordable-elip-200-unlocks-up-to-90-lower-fees-on-liquid/",
        ),
        start=1,
    )
}

# Matches src="..." / href="..." attributes (single or double quotes,
# case-insensitive). Group 1 is the attribute name, group 2 its raw value.
attr_re = re.compile(r'(src|href)=["\']([^"\']+)["\']', re.I)

# Only assets served from these hosts are downloaded.
allow_hosts = {"blog.blockstream.com", "blockstream.com", "blockstream.imgix.net", "use.typekit.net", "fonts.googleapis.com", "fonts.gstatic.com"}


def is_asset(url: str) -> bool:
    """Return True if *url* is a downloadable asset on an allowed host.

    A URL qualifies when all of the following hold:

    * its scheme is ``http`` or ``https``;
    * its host is listed in ``allow_hosts``;
    * its path contains ``/assets/`` or ends with a known static-file
      extension (compared case-insensitively).

    Args:
        url: Absolute URL to classify.

    Returns:
        True when the URL should be mirrored locally, False otherwise.
    """
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    # Compare the bare host name rather than netloc: hostname is lower-cased
    # and excludes any port or userinfo, so "Blog.Blockstream.com:443" is
    # recognised as the allowed host "blog.blockstream.com".
    if parts.hostname not in allow_hosts:
        return False
    path = parts.path.lower()
    asset_extensions = (
        ".css", ".js", ".png", ".jpg", ".jpeg", ".gif",
        ".svg", ".webp", ".woff", ".woff2", ".ttf", ".ico",
    )
    return "/assets/" in path or path.endswith(asset_extensions)


def local_path(url: str) -> Path:
    """Map an asset URL to the file it is stored in under ``offline_root``.

    The layout is ``offline_root / <netloc> / <url path>``. A URL path ending
    in "/" gets "index.html" appended, and an empty path becomes "file.bin".
    When the URL has a query string, the first 8 hex digits of its MD5 digest
    are added to the file stem as ``_q<digest>`` so that different query
    variants of the same path do not overwrite each other.

    The URL originates from scraped HTML, so path components that could move
    the result outside ``offline_root`` are neutralised: ".." segments are
    dropped, backslashes are treated as separators, and ":" is replaced by
    "_" (it would otherwise be read as a drive or stream marker on Windows).
    Ordinary URLs map to exactly the same location as before.

    Args:
        url: Absolute asset URL.

    Returns:
        The destination path below ``offline_root``.
    """
    parts = urlparse(url)
    path = parts.path or "/file.bin"
    if path.endswith("/"):
        path += "index.html"

    # Empty and "." segments are dropped just as Path would collapse them;
    # ".." is dropped so the result cannot climb out of the mirror directory.
    segments = [
        segment.replace(":", "_")
        for segment in path.replace("\\", "/").split("/")
        if segment not in ("", ".", "..")
    ]
    if not segments:
        segments = ["file.bin"]

    host = parts.netloc.replace(":", "_").replace("\\", "_")
    if host in (".", ".."):
        host = "_"

    rel = Path(host).joinpath(*segments)
    if parts.query:
        # MD5 only derives a short, stable file-name suffix here; it is not
        # used for anything security related.
        digest = hashlib.md5(parts.query.encode()).hexdigest()[:8]
        rel = rel.with_name(f"{rel.stem}_q{digest}{rel.suffix}")
    return offline_root / rel

# Download every allowed asset referenced by the saved pages and rewrite the
# pages so that those references point at the local copies.
#
# cache maps an absolute asset URL to the Path it was saved to, or to None
# when the download failed, so each URL is fetched at most once per run.
cache = {}
count = 0  # number of assets downloaded successfully
for fname, base_url in source_map.items():
    page = news_dir / fname
    html = page.read_text(encoding="utf-8", errors="ignore")

    # The rewritten page is assembled from slices of the original text with
    # only the matched attribute values swapped out. A document-wide
    # str.replace() would also hit other occurrences of the same text,
    # including the inside of paths that were already rewritten.
    pieces = []
    pos = 0
    for m in attr_re.finditer(html):
        # NOTE(review): raw is the literal attribute text; HTML entities such
        # as "&amp;" are not decoded before the URL is resolved.
        raw = m.group(2)
        try:
            absu = urljoin(base_url, raw)
            wanted = is_asset(absu)
        except ValueError:
            # A malformed URL must not abort the whole run; leave the
            # reference untouched.
            continue
        if not wanted:
            continue

        if absu not in cache:
            try:
                req = Request(absu, headers={"User-Agent": "Mozilla/5.0"})
                with urlopen(req, timeout=10) as r:
                    data = r.read()
                out = local_path(absu)
                out.parent.mkdir(parents=True, exist_ok=True)
                out.write_bytes(data)
            except Exception as exc:
                # Best effort: a failed download keeps the original reference
                # in the page, but the failure is reported rather than hidden.
                print('failed', absu, exc)
                cache[absu] = None
            else:
                cache[absu] = out
                count += 1

        out = cache[absu]
        if out is None:
            continue

        # Forward slashes keep the relative reference valid inside HTML even
        # when the path was computed on Windows.
        rel = os.path.relpath(out, news_dir).replace('\\', '/')
        start, end = m.span(2)
        pieces.append(html[pos:start])
        pieces.append(rel)
        pos = end

    pieces.append(html[pos:])
    page.write_text("".join(pieces), encoding="utf-8")

print('downloaded', count, 'assets')