import hashlib
import os
import re
import sys
from html import unescape
from pathlib import Path
from urllib.parse import quote, urlparse
from urllib.request import Request, urlopen

# Folder that is scanned for saved pages; downloaded resources go into its
# "_offline" sub-folder.
news_dir = Path(r"C:\Users\poc\Downloads\WebScrapBook\data\20260429163450072\news")
offline_root = news_dir.joinpath("_offline")

# src attribute holding an absolute http(s) URL; group 1 captures the URL.
src_re = re.compile(r'src=["\'](https?://[^"\']+)["\']', flags=re.IGNORECASE)
# Complete href attribute whose value is an absolute http(s) URL (no groups).
href_re = re.compile(r'href=["\']https?://[^"\']+["\']', flags=re.IGNORECASE)


def local_path(url: str) -> Path:
    """Map a remote URL to a file location below ``offline_root``.

    The layout is ``<offline_root>/<host>/<url path>``. An empty URL path
    becomes ``file.bin`` and a path ending in ``/`` gets ``index.html``
    appended. When the URL has a query string, ``_q`` plus the first 8 hex
    digits of the query's MD5 is inserted before the file suffix so that
    different queries map to different files.

    The parent directory of the result is created as a side effect.

    Args:
        url: Absolute URL of the remote resource.

    Returns:
        Path of the local file for ``url`` (the file itself is not created).

    Raises:
        OSError: If the parent directory cannot be created.
    """
    p = urlparse(url)
    path = p.path or "/file.bin"
    if path.endswith('/'):
        path += 'index.html'

    # Drop empty, '.' and '..' segments: '..' would otherwise let a crafted
    # URL resolve to a location outside offline_root.
    skipped = ('', '.', '..')
    segments = [seg for seg in path.split('/') if seg not in skipped]
    if not segments:
        # e.g. a path of "/.." - still needs a file name to write to.
        segments = ['file.bin']
    host = '' if p.netloc in skipped else p.netloc

    # ':' (host:port, or a "C:"-style segment that would re-anchor the join),
    # '\\' and the other characters Windows rejects in file names are
    # replaced so the path is usable and stays relative.
    unsafe = str.maketrans({ch: '_' for ch in '<>:"|?*\\'})
    rel = Path(host.translate(unsafe), *(seg.translate(unsafe) for seg in segments))

    if p.query:
        # MD5 is only used to derive a short, stable file-name suffix.
        qh = hashlib.md5(p.query.encode()).hexdigest()[:8]
        rel = rel.with_name(rel.stem + '_q' + qh + rel.suffix)

    out = offline_root / rel
    out.parent.mkdir(parents=True, exist_ok=True)
    return out

# URLs that could not be fetched during this run. Remembered so the same
# dead URL does not cost another 10 s timeout for every page that uses it.
failed_urls = set()

for f in news_dir.glob('news-*.html'):
    original = f.read_text(encoding='utf-8', errors='ignore')
    page = original

    # localize remaining absolute src URLs (each distinct URL handled once)
    for raw_url in dict.fromkeys(src_re.findall(original)):
        # Attribute values are HTML-escaped; request the real URL (&amp; -> &).
        url = unescape(raw_url)
        if url in failed_urls:
            continue
        try:
            # local_path() creates directories, so it can raise OSError too.
            out = local_path(url)
            if not out.is_file():
                req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
                with urlopen(req, timeout=10) as r:
                    data = r.read()
                out.write_bytes(data)
        except Exception as exc:
            # Best effort: keep the remote URL in the page, but report why.
            failed_urls.add(url)
            print(f'skipped {url}: {exc!r}', file=sys.stderr)
            continue

        # Percent-encode the reference: a literal '%' or space in the local
        # file name would otherwise be decoded by the browser to another name.
        rel = quote(os.path.relpath(out, news_dir).replace('\\', '/'))

        # Replace only complete quoted values. A raw substring replace would
        # also rewrite the start of longer URLs that share this prefix.
        for q in ('"', "'"):
            page = page.replace(q + raw_url + q, q + rel + q)

    # disable external navigation links for full local experience
    # (href_re matches any absolute href attribute, whatever the tag)
    page = href_re.sub('href="#"', page)

    # Leave files alone when nothing was rewritten.
    if page != original:
        f.write_text(page, encoding='utf-8')

print('done')