mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 09:37:28 +00:00
🐛 fix(web-crawler): prevent happy-dom CSS parsing crash in htmlToMarkdown (#13652)
- Disable CSS file loading and JS evaluation in happy-dom Window (root cause)
- Add try-catch around Readability.parse() for defense in depth
- Add regression tests for invalid CSS selectors and external stylesheet links

Closes LOBE-6869

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
accc173068
commit
a6e330cfa9
2 changed files with 44 additions and 3 deletions
|
|
@ -50,6 +50,39 @@ describe('htmlToMarkdown', () => {
|
|||
expect(result.content.length).toBeLessThan(html.length);
|
||||
}, 20000);
|
||||
|
||||
it('should not crash on HTML with invalid CSS selectors (LOBE-6869)', () => {
|
||||
// Regression: happy-dom throws TypeError on pages with CSS selectors it cannot parse.
|
||||
// htmlToMarkdown must not propagate this — it should fall back to raw HTML conversion.
|
||||
const html = `
|
||||
<html><head>
|
||||
<style>:is(.foo, :has(> .bar)) { color: red }</style>
|
||||
</head><body>
|
||||
<script type="application/ld+json">{"@type":"Article","name":"Test"}</script>
|
||||
<p>Valid content here</p>
|
||||
</body></html>`;
|
||||
|
||||
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
|
||||
|
||||
expect(result).toBeDefined();
|
||||
expect(result.content).toContain('Valid content');
|
||||
});
|
||||
|
||||
it('should not crash on HTML with external stylesheet links (LOBE-6869)', () => {
|
||||
// Regression: happy-dom's HTMLLinkElement.#loadStyleSheet can crash on CSS parsing.
|
||||
// disableCSSFileLoading should prevent this path entirely.
|
||||
const html = `
|
||||
<html><head>
|
||||
<link rel="stylesheet" href="https://example.com/styles.css">
|
||||
</head><body>
|
||||
<p>Content with external CSS</p>
|
||||
</body></html>`;
|
||||
|
||||
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
|
||||
|
||||
expect(result).toBeDefined();
|
||||
expect(result.content).toContain('Content with external CSS');
|
||||
});
|
||||
|
||||
it('should not truncate HTML under 1 MB', () => {
|
||||
const html = '<html><body><p>Small content</p></body></html>';
|
||||
|
||||
|
|
|
|||
|
|
@ -31,13 +31,21 @@ export const htmlToMarkdown = (
|
|||
{ url, filterOptions }: { filterOptions: FilterOptions; url: string },
|
||||
): HtmlToMarkdownOutput => {
|
||||
const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml;
|
||||
const window = new Window({ url });
|
||||
const window = new Window({
|
||||
settings: { disableCSSFileLoading: true, disableJavaScriptEvaluation: true },
|
||||
url,
|
||||
});
|
||||
|
||||
const document = window.document;
|
||||
document.body.innerHTML = html;
|
||||
|
||||
// @ts-expect-error reason: Readability expects a Document type
|
||||
const parsedContent = new Readability(document).parse();
|
||||
let parsedContent: ReturnType<Readability<string>['parse']> = null;
|
||||
try {
|
||||
// @ts-expect-error reason: Readability expects a Document type
|
||||
parsedContent = new Readability(document).parse();
|
||||
} catch {
|
||||
// happy-dom may throw on pages with invalid CSS selectors — fall back to raw HTML
|
||||
}
|
||||
|
||||
const useReadability = filterOptions.enableReadability ?? true;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue