🐛 fix(web-crawler): prevent happy-dom CSS parsing crash in htmlToMarkdown (#13652)

- Disable CSS file loading and JS evaluation in happy-dom Window (root cause)
- Add try-catch around Readability.parse() for defense in depth
- Add regression tests for invalid CSS selectors and external stylesheet links

Closes LOBE-6869

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Arvin Xu 2026-04-08 12:59:49 +08:00 committed by GitHub
parent accc173068
commit a6e330cfa9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 44 additions and 3 deletions

View file

@ -50,6 +50,39 @@ describe('htmlToMarkdown', () => {
expect(result.content.length).toBeLessThan(html.length);
}, 20000);
it('should not crash on HTML with invalid CSS selectors (LOBE-6869)', () => {
  // Regression guard for LOBE-6869: happy-dom raises a TypeError when a page contains
  // CSS selectors it cannot parse. htmlToMarkdown must absorb that failure and still
  // produce output by falling back to converting the raw HTML.
  // The fixture's lines stay flush-left so the template literal's contents are unchanged.
  const pageWithUnparsableSelectors = `
<html><head>
<style>:is(.foo, :has(> .bar)) { color: red }</style>
</head><body>
<script type="application/ld+json">{"@type":"Article","name":"Test"}</script>
<p>Valid content here</p>
</body></html>`;

  const output = htmlToMarkdown(pageWithUnparsableSelectors, {
    filterOptions: {},
    url: 'https://example.com',
  });

  // The call must return a result, and the paragraph text must survive the conversion.
  expect(output).toBeDefined();
  expect(output.content).toContain('Valid content');
});
it('should not crash on HTML with external stylesheet links (LOBE-6869)', () => {
  // Regression guard for LOBE-6869: happy-dom's HTMLLinkElement.#loadStyleSheet can crash
  // while parsing CSS. With disableCSSFileLoading set, that code path should never run.
  // The fixture's lines stay flush-left so the template literal's contents are unchanged.
  const pageWithStylesheetLink = `
<html><head>
<link rel="stylesheet" href="https://example.com/styles.css">
</head><body>
<p>Content with external CSS</p>
</body></html>`;

  const output = htmlToMarkdown(pageWithStylesheetLink, {
    filterOptions: {},
    url: 'https://example.com',
  });

  // The call must return a result, and the body text must survive the conversion.
  expect(output).toBeDefined();
  expect(output.content).toContain('Content with external CSS');
});
it('should not truncate HTML under 1 MB', () => {
const html = '<html><body><p>Small content</p></body></html>';

View file

@ -31,13 +31,21 @@ export const htmlToMarkdown = (
{ url, filterOptions }: { filterOptions: FilterOptions; url: string },
): HtmlToMarkdownOutput => {
const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml;
const window = new Window({ url });
const window = new Window({
settings: { disableCSSFileLoading: true, disableJavaScriptEvaluation: true },
url,
});
const document = window.document;
document.body.innerHTML = html;
// @ts-expect-error reason: Readability expects a Document type
const parsedContent = new Readability(document).parse();
let parsedContent: ReturnType<Readability<string>['parse']> = null;
try {
// @ts-expect-error reason: Readability expects a Document type
parsedContent = new Readability(document).parse();
} catch {
// happy-dom may throw on pages with invalid CSS selectors — fall back to raw HTML
}
const useReadability = filterOptions.enableReadability ?? true;