﻿from pathlib import Path
import re

# Directory containing the scraped pages; only files matching
# ``news-*.html`` directly inside it are processed (no recursion).
news_dir = Path(r"C:\Users\poc\Downloads\WebScrapBook\data\20260429163450072\news")

# Ordered (regex, replacement) pairs, applied case-insensitively and in
# sequence to the full text of each file.  Order matters: the phrase
# rewrites run first, then the punctuation/whitespace tidy-up cleans up
# whatever gaps they (or the original markup) left behind.
patterns = [
    # Boilerplate sentences: replace the lead-in plus up to N following
    # characters, stopping at the next '<' so no tag is swallowed.
    (r'For technical questions[^<]{0,400}', 'For technical questions, see project documentation.'),
    (r'To learn more about Simplicity,[^<]{0,300}', 'To learn more about Simplicity, see documentation.'),
    (r'For more on Blockstream Research,[^<]{0,300}', 'For more on Blockstream Research, see the research section.'),
    # A dangling "visit" directly followed (after whitespace) by punctuation.
    (r'\bvisit\s+(?=[\.,;:])', ''),
    # Whitespace before a comma / full stop.
    # NOTE(review): these run over the whole file, markup included, so any
    # whitespace before '.' or ',' inside tags/scripts/styles is removed too.
    (r'\s+,', ','),
    (r'\s+\.', '.'),
    # Empty parentheses.
    (r'\(\s*\)', ''),
    # Any run of two or more whitespace characters (newlines included)
    # becomes a single space.
    (r'\s{2,}', ' '),
]

# Compile once up front instead of resolving every pattern again per file.
_compiled_patterns = [(re.compile(pat, flags=re.I), rep) for pat, rep in patterns]


def normalize_text(text):
    """Return *text* with every entry of ``patterns`` applied in order.

    Args:
        text: The string to clean up (here: the full contents of one file).

    Returns:
        The text after all substitutions; identical to the input when no
        pattern matches.
    """
    for regex, replacement in _compiled_patterns:
        text = regex.sub(replacement, text)
    return text


for f in news_dir.glob('news-*.html'):
    # 'surrogateescape' keeps bytes that are not valid UTF-8 as lone
    # surrogates so they are written back unchanged; the previous
    # errors='ignore' silently deleted them from the file on rewrite.
    original = f.read_text(encoding='utf-8', errors='surrogateescape')
    s = normalize_text(original)
    # Skip the write when nothing changed so untouched files are left alone.
    if s != original:
        f.write_text(s, encoding='utf-8', errors='surrogateescape')

print('normalized')
