🐛 fix: improve crawler error handling and timeout cancellation (#12487)

This commit is contained in:
YuTengjing 2026-02-26 22:59:10 +08:00 committed by GitHub
parent 0365a14e16
commit 306c50704e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 1462 additions and 784 deletions

View file

@ -0,0 +1,55 @@
---
name: pr
description: "Create a PR for the current branch. Use when the user asks to create a pull request, submit PR, or says 'pr'."
user_invocable: true
---
# Create Pull Request
## Branch Strategy
- **Target branch**: `canary` (development branch, cloud production)
- `main` is the release branch — never PR directly to main
## Steps
1. **Gather context** (run in parallel):
- `git branch --show-current` — current branch name
- `git rev-parse --abbrev-ref @{u} 2>/dev/null` — remote tracking status
- `git log --oneline @{u}..HEAD 2>/dev/null` — unpushed commits (no output when the branch has no upstream)
- `gh pr list --head "$(git branch --show-current)" --json number,title,state,url` — existing PR
- `git log --oneline origin/canary..HEAD` — commit history for PR title
- `git diff --stat --stat-count=20 origin/canary..HEAD` — change summary
2. **Push if needed**:
- No upstream: `git push -u origin $(git branch --show-current)`
- Has upstream: `git push origin $(git branch --show-current)`
3. **Search related GitHub issues**:
- `gh issue list --search "<keywords>" --state all --limit 10`
- Only link issues with matching scope (avoid large umbrella issues)
- Skip if no matching issue found
4. **Create PR** with `gh pr create --base canary`:
- Title: `<gitmoji> <type>(<scope>): <description>`
- Body: based on PR template (`.github/PULL_REQUEST_TEMPLATE.md`), fill checkboxes
- Link related GitHub issues using magic keywords (`Fixes #123`, `Closes #123`)
- Link Linear issues if applicable (`Fixes LOBE-xxx`)
- Use HEREDOC for body to preserve formatting
5. **Open in browser**: `gh pr view --web`
## PR Template
Use `.github/PULL_REQUEST_TEMPLATE.md` as the body structure. Key sections:
- **Change Type**: Check the appropriate gitmoji type
- **Related Issue**: Link GitHub/Linear issues with magic keywords
- **Description of Change**: Summarize what and why
- **How to Test**: Describe test approach, check relevant boxes
## Notes
- **Release impact**: PR titles starting with `✨ feat` or `🐛 fix` trigger releases — use carefully
- **Language**: All PR content must be in English
- If a PR already exists for the branch, inform the user instead of creating a duplicate

View file

@ -1,3 +1,8 @@
---
name: upstash-workflow
description: 'Upstash Workflow implementation guide. Use when creating async workflows with QStash, implementing fan-out patterns, or building 3-layer workflow architecture (process → paginate → execute).'
---
# Upstash Workflow Implementation Guide
This guide covers the standard patterns for implementing Upstash Workflow + QStash async workflows in the LobeHub codebase.

View file

@ -38,7 +38,8 @@ lobe-chat/
### Git Workflow
- The current release branch is `next` until v2.0.0 is officially released
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
- New branches should be created from `canary`; PRs should target `canary`
- Use rebase for git pull
- Git commit messages should be prefixed with a gitmoji
- Git branch name format: `username/feat/feature-name`

View file

@ -33,6 +33,8 @@ lobe-chat/
### Git Workflow
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
- New branches should be created from `canary`; PRs should target `canary`
- Use rebase for `git pull`
- Commit messages: prefix with gitmoji
- Branch format: `<type>/<feature-name>`

View file

@ -33,6 +33,8 @@ lobe-chat/
### Git Workflow
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
- New branches should be created from `canary`; PRs should target `canary`
- Use rebase for `git pull`
- Commit messages: prefix with gitmoji
- Branch format: `<type>/<feature-name>`

View file

@ -51,6 +51,24 @@ Supported crawler types are listed below:
---
## `CRAWL_CONCURRENCY`
Controls crawler concurrency per crawl task. The default is `3`. On low-resource servers, use `1` to reduce CPU spikes.
```env
CRAWL_CONCURRENCY=3
```
## `CRAWLER_RETRY`
Controls how many times each URL is retried after a crawl failure. The default is `1` (up to 2 attempts total).
```env
CRAWLER_RETRY=1
```
---
## `SEARCH_PROVIDERS`
Configure which search engine providers to use for web search.

View file

@ -46,6 +46,24 @@ CRAWLER_IMPLS="naive,search1api"
---
## `CRAWL_CONCURRENCY`
控制单次网页抓取任务的并发数量,默认值为 `3`。在低配置服务器上建议设置为 `1` 以降低 CPU 峰值。
```env
CRAWL_CONCURRENCY=3
```
## `CRAWLER_RETRY`
控制单个 URL 的抓取失败重试次数,默认值为 `1`(即最多尝试 2 次)。
```env
CRAWLER_RETRY=1
```
---
## `SEARCH_PROVIDERS`
配置联网搜索使用的搜索引擎提供商。

View file

@ -1,4 +1,4 @@
import { describe, expect, it, vi } from 'vitest';
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { Crawler } from '../crawler';
@ -19,6 +19,16 @@ vi.mock('../utils/appUrlRules', () => ({
}));
describe('Crawler', () => {
beforeEach(async () => {
vi.clearAllMocks();
// Reset applyUrlRules to default (no impls override)
const { applyUrlRules } = await import('../utils/appUrlRules');
vi.mocked(applyUrlRules).mockReturnValue({
transformedUrl: 'https://example.com',
filterOptions: {},
});
});
const crawler = new Crawler();
it('should crawl successfully with default impls', async () => {
@ -194,11 +204,12 @@ describe('Crawler', () => {
});
expect(result).toEqual({
crawler: undefined,
crawler: 'browserless',
data: {
content: 'Fail to crawl the page. Error type: UnknownError, error message: undefined',
errorMessage: undefined,
errorType: 'UnknownError',
content:
'Fail to crawl the page. Error type: EmptyCrawlResultError, error message: browserless returned empty or short content',
errorMessage: 'browserless returned empty or short content',
errorType: 'EmptyCrawlResultError',
},
originalUrl: 'https://example.com',
transformedUrl: undefined,

View file

@ -1,7 +1,13 @@
import { describe, expect, it, vi } from 'vitest';
import * as withTimeoutModule from '../../utils/withTimeout';
import { browserless } from '../browserless';
// Mock withTimeout to just call the factory function directly (bypassing real timeout)
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
fn(new AbortController().signal),
);
describe('browserless', () => {
it('should throw BrowserlessInitError when env vars not set', async () => {
const originalEnv = { ...process.env };
@ -16,17 +22,22 @@ describe('browserless', () => {
process.env = originalEnv;
});
it('should return undefined on fetch error', async () => {
it('should throw NetworkConnectionError on fetch failed', async () => {
process.env.BROWSERLESS_TOKEN = 'test-token';
global.fetch = vi.fn().mockRejectedValue(new Error('Fetch error'));
global.fetch = vi.fn().mockRejectedValue(new TypeError('fetch failed'));
const result = await browserless('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
const { NetworkConnectionError } = await import('../../utils/errorType');
await expect(browserless('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
);
});
it('should return undefined when content is empty', async () => {
process.env.BROWSERLESS_TOKEN = 'test-token';
global.fetch = vi.fn().mockResolvedValue({
ok: true,
status: 200,
statusText: 'OK',
text: vi.fn().mockResolvedValue('<html></html>'),
} as any);
@ -37,6 +48,9 @@ describe('browserless', () => {
it('should return undefined when title is "Just a moment..."', async () => {
process.env.BROWSERLESS_TOKEN = 'test-token';
global.fetch = vi.fn().mockResolvedValue({
ok: true,
status: 200,
statusText: 'OK',
text: vi.fn().mockResolvedValue('<html><title>Just a moment...</title></html>'),
} as any);
@ -46,7 +60,12 @@ describe('browserless', () => {
it('should return crawl result on successful fetch', async () => {
process.env.BROWSERLESS_TOKEN = 'test-token';
const longContent =
'This is a test paragraph with enough content to pass the length check. '.repeat(3);
global.fetch = vi.fn().mockResolvedValue({
ok: true,
status: 200,
statusText: 'OK',
text: vi.fn().mockResolvedValue(`
<html>
<head>
@ -54,7 +73,7 @@ describe('browserless', () => {
<meta name="description" content="Test Description">
</head>
<body>
<h1>Test Content</h1>
<p>${longContent}</p>
</body>
</html>
`),
@ -76,6 +95,9 @@ describe('browserless', () => {
it('should include rejectRequestPattern in request payload', async () => {
process.env.BROWSERLESS_TOKEN = 'test-token';
const fetchMock = vi.fn().mockResolvedValue({
ok: true,
status: 200,
statusText: 'OK',
text: vi.fn().mockResolvedValue('<html><title>Test</title></html>'),
});
global.fetch = fetchMock;
@ -90,9 +112,7 @@ describe('browserless', () => {
it('should allow requests to permitted file types', async () => {
const allowedExtensions = ['html', 'css', 'js', 'json', 'xml', 'webmanifest', 'txt', 'md'];
const pattern = new RegExp(
'.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$',
);
const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/;
allowedExtensions.forEach((ext) => {
expect(`file.${ext}`).not.toMatch(pattern);
@ -103,9 +123,7 @@ describe('browserless', () => {
it('should reject requests to non-permitted file types', async () => {
const rejectedExtensions = ['jpg', 'png', 'gif', 'pdf', 'doc', 'mp4', 'wav'];
const pattern = new RegExp(
'.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$',
);
const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/;
rejectedExtensions.forEach((ext) => {
expect(`file.${ext}`).toMatch(pattern);
@ -114,14 +132,16 @@ describe('browserless', () => {
});
});
it('should use correct URL when BROWSERLESS_URL is provided', async () => {
const customUrl = 'https://custom.browserless.io';
it('should call fetch with the base URL and content path', async () => {
const originalEnv = { ...process.env };
process.env.BROWSERLESS_TOKEN = 'test-token';
process.env.BROWSERLESS_URL = customUrl;
global.fetch = vi.fn().mockImplementation((url) => {
expect(url).toContain(customUrl);
// BASE_URL is captured at module load time, so we verify fetch is called with /content path
expect(url).toContain('/content');
return Promise.resolve({
ok: true,
status: 200,
statusText: 'OK',
text: () => Promise.resolve('<html><title>Test</title></html>'),
});
});

View file

@ -1,5 +1,6 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { createMockResponse } from '../../test-utils';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
import { exa } from '../exa';
@ -18,23 +19,20 @@ describe('exa crawler', () => {
it('should successfully crawl content with API key', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
requestId: 'test-request-id',
results: [
{
id: 'test-id',
title: 'Test Article',
url: 'https://example.com',
text: 'This is a test article with enough content to pass the length check. '.repeat(3),
author: 'Test Author',
publishedDate: '2023-01-01',
summary: 'Test summary',
},
],
}),
};
const mockResponse = createMockResponse({
requestId: 'test-request-id',
results: [
{
id: 'test-id',
title: 'Test Article',
url: 'https://example.com',
text: 'This is a test article with enough content to pass the length check. '.repeat(3),
author: 'Test Author',
publishedDate: '2023-01-01',
summary: 'Test summary',
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -51,23 +49,20 @@ describe('exa crawler', () => {
url: 'https://example.com',
});
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
it('should handle missing API key', async () => {
// API key is undefined
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
results: [
{
title: 'Test Article',
url: 'https://example.com',
text: 'Test content with sufficient length. '.repeat(5),
},
],
}),
};
const mockResponse = createMockResponse({
results: [
{
title: 'Test Article',
url: 'https://example.com',
text: 'Test content with sufficient length. '.repeat(5),
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -75,19 +70,16 @@ describe('exa crawler', () => {
await exa('https://example.com', { filterOptions: {} });
// Check that fetch was called with empty API key header
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
it('should return undefined when no results are returned', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
requestId: 'test-request-id',
results: [],
}),
};
const mockResponse = createMockResponse({
requestId: 'test-request-id',
results: [],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -108,18 +100,15 @@ describe('exa crawler', () => {
it('should return undefined for short content', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
results: [
{
title: 'Test Article',
url: 'https://example.com',
text: 'Short', // Content too short
},
],
}),
};
const mockResponse = createMockResponse({
results: [
{
title: 'Test Article',
url: 'https://example.com',
text: 'Short', // Content too short
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -132,11 +121,11 @@ describe('exa crawler', () => {
it('should throw PageNotFoundError for 404 status', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('Not Found', {
ok: false,
status: 404,
statusText: 'Not Found',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -149,11 +138,11 @@ describe('exa crawler', () => {
it('should throw error for other HTTP errors', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('', {
ok: false,
status: 500,
statusText: 'Internal Server Error',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -167,7 +156,7 @@ describe('exa crawler', () => {
process.env.EXA_API_KEY = 'test-api-key';
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
@ -198,42 +187,37 @@ describe('exa crawler', () => {
);
});
it('should return undefined when JSON parsing fails', async () => {
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
const mockResponse = createMockResponse('not json', { ok: true });
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
mockResponse.clone.mockReturnValue({
...mockResponse,
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
};
text: vi.fn().mockResolvedValue('not json'),
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
const result = await exa('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
expect(consoleSpy).toHaveBeenCalled();
consoleSpy.mockRestore();
await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow(
'Exa returned non-JSON response: not json',
);
});
it('should use result URL when available', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
results: [
{
title: 'Test Article',
url: 'https://redirected.example.com',
text: 'Test content with sufficient length. '.repeat(5),
},
],
}),
};
const mockResponse = createMockResponse({
results: [
{
title: 'Test Article',
url: 'https://redirected.example.com',
text: 'Test content with sufficient length. '.repeat(5),
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -246,18 +230,15 @@ describe('exa crawler', () => {
it('should fallback to original URL when result URL is missing', async () => {
process.env.EXA_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
results: [
{
title: 'Test Article',
text: 'Test content with sufficient length. '.repeat(5),
// url is missing
},
],
}),
};
const mockResponse = createMockResponse({
results: [
{
title: 'Test Article',
text: 'Test content with sufficient length. '.repeat(5),
// url is missing
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);

View file

@ -1,5 +1,6 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { createMockResponse } from '../../test-utils';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
import { firecrawl } from '../firecrawl';
@ -19,25 +20,23 @@ describe('firecrawl crawler', () => {
it('should successfully crawl content with API key', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
success: true,
data: {
markdown:
'This is a test markdown content with enough length to pass validation. '.repeat(3),
metadata: {
title: 'Test Article',
description: 'Test description',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
const mockResponse = createMockResponse({
success: true,
data: {
markdown: 'This is a test markdown content with enough length to pass validation. '.repeat(
3,
),
metadata: {
title: 'Test Article',
description: 'Test description',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
}),
};
},
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -55,58 +54,52 @@ describe('firecrawl crawler', () => {
url: 'https://example.com',
});
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
it('should handle missing API key', async () => {
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
success: true,
data: {
markdown: 'Test content with sufficient length. '.repeat(5),
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
const mockResponse = createMockResponse({
success: true,
data: {
markdown: 'Test content with sufficient length. '.repeat(5),
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
}),
};
},
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
await firecrawl('https://example.com', { filterOptions: {} });
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
it('should return undefined for short content', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
success: true,
data: {
markdown: 'Short', // Content too short
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
const mockResponse = createMockResponse({
success: true,
data: {
markdown: 'Short', // Content too short
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
}),
};
},
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -119,24 +112,21 @@ describe('firecrawl crawler', () => {
it('should return undefined when markdown is missing', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
success: true,
data: {
// markdown is missing
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
const mockResponse = createMockResponse({
success: true,
data: {
// markdown is missing
metadata: {
title: 'Test',
description: 'Test',
sourceURL: 'https://example.com',
statusCode: 200,
language: 'en',
keywords: 'test',
robots: 'index',
},
}),
};
},
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -149,11 +139,11 @@ describe('firecrawl crawler', () => {
it('should throw PageNotFoundError for 404 status', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('Not Found', {
ok: false,
status: 404,
statusText: 'Not Found',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -166,11 +156,11 @@ describe('firecrawl crawler', () => {
it('should throw error for other HTTP errors', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('', {
ok: false,
status: 500,
statusText: 'Internal Server Error',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -184,7 +174,7 @@ describe('firecrawl crawler', () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
@ -217,54 +207,49 @@ describe('firecrawl crawler', () => {
);
});
it('should return undefined when JSON parsing fails', async () => {
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
const mockResponse = createMockResponse('not json', { ok: true });
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
mockResponse.clone.mockReturnValue({
...mockResponse,
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
};
text: vi.fn().mockResolvedValue('not json'),
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
const result = await firecrawl('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
expect(consoleSpy).toHaveBeenCalled();
consoleSpy.mockRestore();
await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow(
'Firecrawl returned non-JSON response: not json',
);
});
it('should handle metadata with all optional fields', async () => {
process.env.FIRECRAWL_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
success: true,
data: {
markdown: 'Complete test content with all metadata fields provided. '.repeat(3),
metadata: {
title: 'Complete Test Article',
description: 'Complete test description',
keywords: 'test,complete,article',
language: 'en',
ogDescription: 'OG description',
ogImage: 'https://example.com/image.jpg',
ogLocaleAlternate: ['en-US', 'fr-FR'],
ogSiteName: 'Example Site',
ogTitle: 'OG Title',
ogUrl: 'https://example.com/og',
robots: 'index,follow',
statusCode: 200,
sourceURL: 'https://example.com',
},
const mockResponse = createMockResponse({
success: true,
data: {
markdown: 'Complete test content with all metadata fields provided. '.repeat(3),
metadata: {
title: 'Complete Test Article',
description: 'Complete test description',
keywords: 'test,complete,article',
language: 'en',
ogDescription: 'OG description',
ogImage: 'https://example.com/image.jpg',
ogLocaleAlternate: ['en-US', 'fr-FR'],
ogSiteName: 'Example Site',
ogTitle: 'OG Title',
ogUrl: 'https://example.com/og',
robots: 'index,follow',
statusCode: 200,
sourceURL: 'https://example.com',
},
}),
};
},
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);

View file

@ -1,29 +1,44 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { createMockResponse } from '../../test-utils';
import * as withTimeoutModule from '../../utils/withTimeout';
import { jina } from '../jina';
// Mock withTimeout to just call the factory function directly (bypassing real timeout)
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
fn(new AbortController().signal),
);
describe('jina crawler', () => {
const mockFetch = vi.fn();
global.fetch = mockFetch;
beforeEach(() => {
vi.resetAllMocks();
// Re-apply the withTimeout spy after resetAllMocks
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
fn(new AbortController().signal),
);
});
it('should crawl url successfully', async () => {
const mockResponse = {
ok: true,
json: () =>
Promise.resolve({
code: 200,
data: {
content: 'test content',
description: 'test description',
siteName: 'test site',
title: 'test title',
},
}),
};
const testContent =
'This is a test content that is long enough to pass the minimum length validation check. '.repeat(
2,
);
const mockResponse = createMockResponse(
{
code: 200,
data: {
content: testContent,
description: 'test description',
siteName: 'test site',
title: 'test title',
},
},
{ ok: true },
);
mockFetch.mockResolvedValue(mockResponse);
@ -38,13 +53,14 @@ describe('jina crawler', () => {
'Authorization': 'Bearer test-key',
'x-send-from': 'LobeChat Community',
},
signal: expect.any(AbortSignal),
});
expect(result).toEqual({
content: 'test content',
content: testContent,
contentType: 'text',
description: 'test description',
length: 12,
length: testContent.length,
siteName: 'test site',
title: 'test title',
url: 'https://example.com',
@ -54,16 +70,15 @@ describe('jina crawler', () => {
it('should use JINA_READER_API_KEY from env if apiKey not provided', async () => {
process.env.JINA_READER_API_KEY = 'env-reader-key';
const mockResponse = {
ok: true,
json: () =>
Promise.resolve({
code: 200,
data: {
content: 'test content',
},
}),
};
const mockResponse = createMockResponse(
{
code: 200,
data: {
content: 'test content',
},
},
{ ok: true },
);
mockFetch.mockResolvedValue(mockResponse);
@ -75,6 +90,7 @@ describe('jina crawler', () => {
'Authorization': 'Bearer env-reader-key',
'x-send-from': 'LobeChat Community',
},
signal: expect.any(AbortSignal),
});
delete process.env.JINA_READER_API_KEY;
@ -83,16 +99,15 @@ describe('jina crawler', () => {
it('should use JINA_API_KEY from env if apiKey and JINA_READER_API_KEY not provided', async () => {
process.env.JINA_API_KEY = 'env-key';
const mockResponse = {
ok: true,
json: () =>
Promise.resolve({
code: 200,
data: {
content: 'test content',
},
}),
};
const mockResponse = createMockResponse(
{
code: 200,
data: {
content: 'test content',
},
},
{ ok: true },
);
mockFetch.mockResolvedValue(mockResponse);
@ -104,22 +119,22 @@ describe('jina crawler', () => {
'Authorization': 'Bearer env-key',
'x-send-from': 'LobeChat Community',
},
signal: expect.any(AbortSignal),
});
delete process.env.JINA_API_KEY;
});
it('should send empty Authorization header if no api key provided', async () => {
const mockResponse = {
ok: true,
json: () =>
Promise.resolve({
code: 200,
data: {
content: 'test content',
},
}),
};
const mockResponse = createMockResponse(
{
code: 200,
data: {
content: 'test content',
},
},
{ ok: true },
);
mockFetch.mockResolvedValue(mockResponse);
@ -131,11 +146,14 @@ describe('jina crawler', () => {
'Authorization': '',
'x-send-from': 'LobeChat Community',
},
signal: expect.any(AbortSignal),
});
});
it('should return undefined if response is not ok', async () => {
mockFetch.mockResolvedValue({ ok: false });
mockFetch.mockResolvedValue(
createMockResponse(null, { ok: false, status: 500, statusText: 'Internal Server Error' }),
);
const result = await jina('https://example.com', { filterOptions: {} });
@ -143,14 +161,13 @@ describe('jina crawler', () => {
});
it('should return undefined if response code is not 200', async () => {
const mockResponse = {
ok: true,
json: () =>
Promise.resolve({
code: 400,
message: 'Bad Request',
}),
};
const mockResponse = createMockResponse(
{
code: 400,
message: 'Bad Request',
},
{ ok: true },
);
mockFetch.mockResolvedValue(mockResponse);
@ -159,11 +176,11 @@ describe('jina crawler', () => {
expect(result).toBeUndefined();
});
it('should return undefined if fetch throws error', async () => {
it('should throw error if fetch throws non-fetch-failed error', async () => {
mockFetch.mockRejectedValue(new Error('Network error'));
const result = await jina('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
await expect(jina('https://example.com', { filterOptions: {} })).rejects.toThrow(
'Network error',
);
});
});

View file

@ -22,9 +22,10 @@ describe('naive crawler', () => {
vi.clearAllMocks();
});
it('should return undefined for normal pages (due to cloudflare logic)', async () => {
it('should return content for normal pages', async () => {
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'text/html']]),
text: vi.fn().mockResolvedValue('<html><body>Test content</body></html>'),
};
@ -34,8 +35,8 @@ describe('naive crawler', () => {
const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown');
vi.mocked(htmlToMarkdown).mockReturnValue({
content: 'Test content'.padEnd(101, ' '), // Ensure length > 100
title: 'Normal Page Title', // Not "Just a moment..." so it returns undefined
content: 'Test content'.padEnd(101, ' '),
title: 'Normal Page Title',
description: 'Test description',
siteName: 'Test Site',
length: 101,
@ -43,13 +44,22 @@ describe('naive crawler', () => {
const result = await naive('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
expect(result).toEqual({
content: 'Test content'.padEnd(101, ' '),
contentType: 'text',
description: 'Test description',
length: 101,
siteName: 'Test Site',
title: 'Normal Page Title',
url: 'https://example.com',
});
});
it('should successfully crawl JSON content', async () => {
const mockJsonData = { message: 'Hello world', data: [1, 2, 3] };
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'application/json']]),
clone: () => ({
json: vi.fn().mockResolvedValue(mockJsonData),
@ -74,6 +84,7 @@ describe('naive crawler', () => {
const mockText = '{"invalid": json}';
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'application/json']]),
clone: () => ({
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
@ -97,6 +108,7 @@ describe('naive crawler', () => {
it('should return undefined for short content', async () => {
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'text/html']]),
text: vi.fn().mockResolvedValue('<html><body>Short</body></html>'),
};
@ -116,9 +128,10 @@ describe('naive crawler', () => {
expect(result).toBeUndefined();
});
it('should return content when NOT blocked by Cloudflare', async () => {
it('should return undefined when blocked by Cloudflare', async () => {
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'text/html']]),
text: vi.fn().mockResolvedValue('<html><body>Normal content</body></html>'),
};
@ -129,7 +142,7 @@ describe('naive crawler', () => {
const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown');
vi.mocked(htmlToMarkdown).mockReturnValue({
content: 'Test content'.padEnd(101, ' '),
title: 'Just a moment...', // Cloudflare blocking page - this will cause return
title: 'Just a moment...', // Cloudflare blocking page
description: 'Test description',
siteName: 'Test Site',
length: 101,
@ -137,15 +150,21 @@ describe('naive crawler', () => {
const result = await naive('https://example.com', { filterOptions: {} });
expect(result).toEqual({
content: 'Test content'.padEnd(101, ' '),
contentType: 'text',
description: 'Test description',
length: 101,
siteName: 'Test Site',
title: 'Just a moment...',
url: 'https://example.com',
});
expect(result).toBeUndefined();
});
// Test: when the fetched response is not OK (status 500), `naive` must reject
// with an error whose message contains the status code.
it('should throw error for non-ok status codes', async () => {
// Hand-rolled Response-like stub exposing only status/ok/statusText/text().
// NOTE(review): it has no `headers` or `clone()`; confirm the error path taken
// for non-OK responses needs nothing beyond these fields.
const mockResponse = {
status: 500,
ok: false,
statusText: 'Internal Server Error',
text: vi.fn().mockResolvedValue('Server Error'),
};
const { withTimeout } = await import('../../utils/withTimeout');
// The mocked withTimeout resolves straight to the stub, so no fetch is issued.
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
// Only asserts that "500" appears in the error message (regex match), not the error class.
await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow(/500/);
});
it('should throw PageNotFoundError for 404 status', async () => {
@ -164,7 +183,7 @@ describe('naive crawler', () => {
it('should throw NetworkConnectionError for fetch failures', async () => {
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
@ -194,6 +213,7 @@ describe('naive crawler', () => {
it('should return undefined when HTML processing fails', async () => {
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'text/html']]),
text: vi.fn().mockRejectedValue(new Error('Failed to read text')),
};
@ -209,6 +229,7 @@ describe('naive crawler', () => {
it('should pass filter options to htmlToMarkdown', async () => {
const mockResponse = {
status: 200,
ok: true,
headers: new Map([['content-type', 'text/html']]),
text: vi.fn().mockResolvedValue('<html><body>Test content</body></html>'),
};

View file

@ -1,5 +1,6 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { createMockResponse } from '../../test-utils';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
import * as withTimeoutModule from '../../utils/withTimeout';
import { search1api } from '../search1api';
@ -17,8 +18,10 @@ describe('search1api crawler', () => {
originalEnv = { ...process.env };
process.env.SEARCH1API_API_KEY = 'test-api-key';
// Mock withTimeout to directly return the promise
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((promise) => promise);
// Mock withTimeout to call the factory function directly (bypassing real timeout)
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
fn(new AbortController().signal),
);
});
afterEach(() => {
@ -26,7 +29,7 @@ describe('search1api crawler', () => {
});
it('should throw NetworkConnectionError when fetch fails', async () => {
mockFetch.mockRejectedValue(new Error('fetch failed'));
mockFetch.mockRejectedValue(new TypeError('fetch failed'));
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
@ -48,11 +51,13 @@ describe('search1api crawler', () => {
});
it('should throw PageNotFoundError when status is 404', async () => {
mockFetch.mockResolvedValue({
ok: false,
status: 404,
statusText: 'Not Found',
});
mockFetch.mockResolvedValue(
createMockResponse('Not Found', {
ok: false,
status: 404,
statusText: 'Not Found',
}),
);
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
PageNotFoundError,
@ -60,11 +65,13 @@ describe('search1api crawler', () => {
});
it('should throw error for other failed responses', async () => {
mockFetch.mockResolvedValue({
ok: false,
status: 500,
statusText: 'Internal Server Error',
});
mockFetch.mockResolvedValue(
createMockResponse('', {
ok: false,
status: 500,
statusText: 'Internal Server Error',
}),
);
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
'Search1API request failed with status 500: Internal Server Error',
@ -72,18 +79,19 @@ describe('search1api crawler', () => {
});
it('should return undefined when content is too short', async () => {
mockFetch.mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
mockFetch.mockResolvedValue(
createMockResponse(
{
crawlParameters: { url: 'https://example.com' },
results: {
title: 'Test Title',
link: 'https://example.com',
content: 'Short', // Less than 100 characters
},
}),
});
},
{ ok: true },
),
);
const result = await search1api('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
@ -92,18 +100,19 @@ describe('search1api crawler', () => {
it('should return crawl result on successful fetch', async () => {
const mockContent = 'This is a test content that is longer than 100 characters. '.repeat(3);
mockFetch.mockResolvedValue({
ok: true,
json: () =>
Promise.resolve({
mockFetch.mockResolvedValue(
createMockResponse(
{
crawlParameters: { url: 'https://example.com' },
results: {
title: 'Test Title',
link: 'https://example.com',
content: mockContent,
},
}),
});
},
{ ok: true },
),
);
const result = await search1api('https://example.com', { filterOptions: {} });
@ -116,6 +125,7 @@ describe('search1api crawler', () => {
body: JSON.stringify({
url: 'https://example.com',
}),
signal: expect.any(AbortSignal),
});
expect(result).toEqual({
@ -130,12 +140,18 @@ describe('search1api crawler', () => {
});
// Test: search1api's behavior when the response body cannot be parsed as JSON.
// NOTE(review): this span is a diff hunk; lines from before and after the change
// appear interleaved here without +/- markers (the plain-object mock and the
// `toBeUndefined` expectation look like the pre-change version; the
// createMockResponse-based setup and `rejects.toThrow()` look like the post-change version).
it('should handle JSON parse errors', async () => {
mockFetch.mockResolvedValue({
ok: true,
json: () => Promise.reject(new Error('Invalid JSON')),
});
// NOTE(review): this mockResolvedValue call is superseded by the
// `mockFetch.mockResolvedValue(response)` call further down before the crawler
// runs, so it appears to be redundant.
mockFetch.mockResolvedValue(createMockResponse('invalid json', { ok: true }));
// Override json to reject for this specific test
const response = createMockResponse('invalid json', { ok: true });
response.json = () => Promise.reject(new Error('Invalid JSON'));
// clone should also return a response whose text() works for error reporting
response.clone = () => {
const cloned = createMockResponse('invalid json', { ok: true });
cloned.json = () => Promise.reject(new Error('Invalid JSON'));
return cloned;
};
mockFetch.mockResolvedValue(response);
const result = await search1api('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
// NOTE(review): a bare `rejects.toThrow()` accepts any rejection; consider
// asserting the expected message or error class so unrelated failures do not pass.
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow();
});
});

View file

@ -1,5 +1,6 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { createMockResponse } from '../../test-utils';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
import { tavily } from '../tavily';
@ -19,21 +20,18 @@ describe('tavily crawler', () => {
it('should successfully crawl content with API key', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.5,
results: [
{
url: 'https://example.com',
raw_content:
'This is a test raw content with sufficient length to pass validation. '.repeat(3),
images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'],
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.5,
results: [
{
url: 'https://example.com',
raw_content:
'This is a test raw content with sufficient length to pass validation. '.repeat(3),
images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'],
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -50,69 +48,60 @@ describe('tavily crawler', () => {
url: 'https://example.com',
});
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
// Test: runs the tavily crawler with TAVILY_EXTRACT_DEPTH set to 'advanced'.
// NOTE(review): this span is a diff hunk; two `mockResponse` declarations and two
// `toHaveBeenCalledWith` expectations appear because pre- and post-change lines are
// shown interleaved (object-literal mock / `expect.any(Promise)` vs.
// `createMockResponse` / `expect.any(Function)`).
it('should use custom extract depth when provided', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
// NOTE(review): the env var is set here and not reset inside this block; presumably
// a beforeEach/afterEach outside this view restores process.env — confirm.
process.env.TAVILY_EXTRACT_DEPTH = 'advanced';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 2.1,
results: [
{
url: 'https://example.com',
raw_content: 'Advanced extraction content with more details. '.repeat(5),
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 2.1,
results: [
{
url: 'https://example.com',
raw_content: 'Advanced extraction content with more details. '.repeat(5),
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
// withTimeout is mocked to resolve directly to the fake response, so whatever is
// passed as its first argument is never invoked by this test.
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
await tavily('https://example.com', { filterOptions: {} });
// NOTE(review): the assertions only check withTimeout's argument types and the
// 30000 timeout value; the request body is not inspected, so the 'advanced'
// extract depth named in the test title is not actually verified here.
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
// Test: runs the tavily crawler without assigning TAVILY_API_KEY inside the test.
// NOTE(review): this span is a diff hunk; pre-change lines (object-literal mock,
// `expect.any(Promise)`) and post-change lines (`createMockResponse`,
// `expect.any(Function)`) are shown interleaved.
// NOTE(review): whether the key is truly absent depends on env setup/cleanup that
// is outside this view — confirm.
it('should handle missing API key', async () => {
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.2,
results: [
{
url: 'https://example.com',
raw_content: 'Test content with sufficient length. '.repeat(5),
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.2,
results: [
{
url: 'https://example.com',
raw_content: 'Test content with sufficient length. '.repeat(5),
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
// The mocked withTimeout resolves directly to the fake response.
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
await tavily('https://example.com', { filterOptions: {} });
// Only the call shape (first argument type, 30000 timeout) is asserted; the
// headers that would be sent are not inspected by this test.
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
});
it('should return undefined when no results are returned', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 0.8,
results: [],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 0.8,
results: [],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -133,19 +122,16 @@ describe('tavily crawler', () => {
it('should return undefined for short content', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.1,
results: [
{
url: 'https://example.com',
raw_content: 'Short', // Content too short
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.1,
results: [
{
url: 'https://example.com',
raw_content: 'Short', // Content too short
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -158,20 +144,17 @@ describe('tavily crawler', () => {
it('should return undefined when raw_content is missing', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1,
results: [
{
url: 'https://example.com',
// raw_content is missing
images: ['https://example.com/image.jpg'],
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1,
results: [
{
url: 'https://example.com',
// raw_content is missing
images: ['https://example.com/image.jpg'],
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -184,11 +167,11 @@ describe('tavily crawler', () => {
it('should throw PageNotFoundError for 404 status', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('Not Found', {
ok: false,
status: 404,
statusText: 'Not Found',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -201,11 +184,11 @@ describe('tavily crawler', () => {
it('should throw error for other HTTP errors', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
const mockResponse = createMockResponse('', {
ok: false,
status: 500,
statusText: 'Internal Server Error',
};
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -219,7 +202,7 @@ describe('tavily crawler', () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow(
NetworkConnectionError,
@ -252,43 +235,38 @@ describe('tavily crawler', () => {
);
});
// Test: tavily's behavior when the response body is not valid JSON.
// NOTE(review): this span is a diff hunk; the two `it(` headers and the two
// `mockResponse` declarations are the pre- and post-change versions shown
// interleaved without +/- markers. The console.error spy / `toBeUndefined` lines
// look like the pre-change expectations; the `rejects.toThrow(...)` assertion
// looks like the post-change expectation.
it('should return undefined when JSON parsing fails', async () => {
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
// Response whose json() rejects while text() yields the raw body 'not json'.
const mockResponse = createMockResponse('not json', { ok: true });
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
// The clone is configured with the same failing json() plus a working text(),
// so the raw body remains readable from the clone.
mockResponse.clone.mockReturnValue({
...mockResponse,
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
};
text: vi.fn().mockResolvedValue('not json'),
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
const result = await tavily('https://example.com', { filterOptions: {} });
expect(result).toBeUndefined();
expect(consoleSpy).toHaveBeenCalled();
consoleSpy.mockRestore();
// NOTE(review): the title names ResponseBodyParseError, but only the message text
// is asserted; consider also asserting the error class if it is exported.
await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow(
'Tavily returned non-JSON response: not json',
);
});
it('should use result URL when available', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.3,
results: [
{
url: 'https://redirected.example.com',
raw_content: 'Test content with sufficient length. '.repeat(5),
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.3,
results: [
{
url: 'https://redirected.example.com',
raw_content: 'Test content with sufficient length. '.repeat(5),
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -301,19 +279,16 @@ describe('tavily crawler', () => {
it('should fallback to original URL when result URL is missing', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.4,
results: [
{
raw_content: 'Test content with sufficient length. '.repeat(5),
// url is missing
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.4,
results: [
{
raw_content: 'Test content with sufficient length. '.repeat(5),
// url is missing
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
@ -326,20 +301,17 @@ describe('tavily crawler', () => {
it('should handle failed results in response', async () => {
process.env.TAVILY_API_KEY = 'test-api-key';
const mockResponse = {
ok: true,
json: vi.fn().mockResolvedValue({
base_url: 'https://api.tavily.com',
response_time: 1.6,
results: [],
failed_results: [
{
url: 'https://example.com',
error: 'Page not accessible',
},
],
}),
};
const mockResponse = createMockResponse({
base_url: 'https://api.tavily.com',
response_time: 1.6,
results: [],
failed_results: [
{
url: 'https://example.com',
error: 'Page not accessible',
},
],
});
const { withTimeout } = await import('../../utils/withTimeout');
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);

View file

@ -2,7 +2,10 @@ import qs from 'query-string';
import urlJoin from 'url-join';
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
import { createHTTPStatusError } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
// Allowed file types: html, css, js, json, xml, webmanifest, txt, md
@ -31,46 +34,62 @@ export const browserless: CrawlImpl = async (url, { filterOptions }) => {
url,
};
let res: Response;
try {
const res = await fetch(
qs.stringifyUrl({
query: {
blockAds: BROWSERLESS_BLOCK_ADS,
launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }),
token: BROWSERLESS_TOKEN,
},
url: urlJoin(BASE_URL, '/content'),
}),
{
body: JSON.stringify(input),
headers: {
'Content-Type': 'application/json',
},
method: 'POST',
},
res = await withTimeout(
(signal) =>
fetch(
qs.stringifyUrl({
query: {
blockAds: BROWSERLESS_BLOCK_ADS,
launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }),
token: BROWSERLESS_TOKEN,
},
url: urlJoin(BASE_URL, '/content'),
}),
{
body: JSON.stringify(input),
headers: {
'Content-Type': 'application/json',
},
method: 'POST',
signal,
},
),
DEFAULT_TIMEOUT,
);
const html = await res.text();
} catch (e) {
throw toFetchError(e);
}
const result = htmlToMarkdown(html, { filterOptions, url });
if (
!!result.content &&
result.title &&
// "Just a moment..." indicates being blocked by CloudFlare
result.title.trim() !== 'Just a moment...'
) {
return {
content: result.content,
contentType: 'text',
description: result?.description,
length: result.length,
siteName: result?.siteName,
title: result?.title,
url,
} satisfies CrawlSuccessResult;
if (!res.ok) {
if (res.status === 404) {
throw new PageNotFoundError(res.statusText);
}
} catch (error) {
console.error(error);
throw await createHTTPStatusError(res, 'Browserless');
}
const html = await res.text();
const result = htmlToMarkdown(html, { filterOptions, url });
if (
!!result.content &&
result.content.length > 100 &&
result.title &&
// "Just a moment..." indicates being blocked by CloudFlare
result.title.trim() !== 'Just a moment...'
) {
return {
content: result.content,
contentType: 'text',
description: result?.description,
length: result.length,
siteName: result?.siteName,
title: result?.title,
url,
} satisfies CrawlSuccessResult;
}
return;

View file

@ -1,5 +1,6 @@
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
interface ExaResults {
@ -27,31 +28,24 @@ export const exa: CrawlImpl = async (url) => {
try {
res = await withTimeout(
fetch('https://api.exa.ai/contents', {
body: JSON.stringify({
livecrawl: 'fallback', // always, fallback
text: true,
urls: [url],
(signal) =>
fetch('https://api.exa.ai/contents', {
body: JSON.stringify({
livecrawl: 'fallback', // always, fallback
text: true,
urls: [url],
}),
headers: {
'Content-Type': 'application/json',
'x-api-key': !apiKey ? '' : apiKey,
},
method: 'POST',
signal,
}),
headers: {
'Content-Type': 'application/json',
'x-api-key': !apiKey ? '' : apiKey,
},
method: 'POST',
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
const error = e as Error;
if (error.message === 'fetch failed') {
throw new NetworkConnectionError();
}
if (error instanceof TimeoutError) {
throw error;
}
throw e;
throw toFetchError(e);
}
if (!res.ok) {
@ -59,35 +53,29 @@ export const exa: CrawlImpl = async (url) => {
throw new PageNotFoundError(res.statusText);
}
throw new Error(`Exa request failed with status ${res.status}: ${res.statusText}`);
throw await createHTTPStatusError(res, 'Exa');
}
try {
const data = (await res.json()) as ExaResponse;
const data = await parseJSONResponse<ExaResponse>(res, 'Exa');
if (!data.results || data.results.length === 0) {
console.warn('Exa API returned no results for URL:', url);
return;
}
const firstResult = data.results[0];
// Check if content is empty or too short
if (!firstResult.text || firstResult.text.length < 100) {
return;
}
return {
content: firstResult.text,
contentType: 'text',
length: firstResult.text.length,
siteName: new URL(url).hostname,
title: firstResult.title,
url: firstResult.url || url,
} satisfies CrawlSuccessResult;
} catch (error) {
console.error(error);
if (!data.results || data.results.length === 0) {
console.warn('Exa API returned no results for URL:', url);
return;
}
return;
const firstResult = data.results[0];
// Check if content is empty or too short
if (!firstResult.text || firstResult.text.length < 100) {
return;
}
return {
content: firstResult.text,
contentType: 'text',
length: firstResult.text.length,
siteName: new URL(url).hostname,
title: firstResult.title,
url: firstResult.url || url,
} satisfies CrawlSuccessResult;
};

View file

@ -1,5 +1,6 @@
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
interface FirecrawlMetadata {
@ -57,30 +58,23 @@ export const firecrawl: CrawlImpl = async (url) => {
try {
res = await withTimeout(
fetch(`${baseUrl}/scrape`, {
body: JSON.stringify({
formats: ['markdown'], // ["markdown", "html"]
url,
(signal) =>
fetch(`${baseUrl}/scrape`, {
body: JSON.stringify({
formats: ['markdown'], // ["markdown", "html"]
url,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
signal,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
const error = e as Error;
if (error.message === 'fetch failed') {
throw new NetworkConnectionError();
}
if (error instanceof TimeoutError) {
throw error;
}
throw e;
throw toFetchError(e);
}
if (!res.ok) {
@ -88,37 +82,34 @@ export const firecrawl: CrawlImpl = async (url) => {
throw new PageNotFoundError(res.statusText);
}
throw new Error(`Firecrawl request failed with status ${res.status}: ${res.statusText}`);
throw await createHTTPStatusError(res, 'Firecrawl');
}
try {
const data = (await res.json()) as FirecrawlResponse;
if (data.data.warning) {
console.warn('[Firecrawl] Warning:', data.data.warning);
}
if (data.data.metadata.error) {
console.error('[Firecrawl] Metadata error:', data.data.metadata.error);
}
// Check if content is empty or too short
if (!data.data.markdown || data.data.markdown.length < 100) {
return;
}
return {
content: data.data.markdown,
contentType: 'text',
description: data.data.metadata.description || '',
length: data.data.markdown.length,
siteName: new URL(url).hostname,
title: data.data.metadata.title || '',
url: url,
} satisfies CrawlSuccessResult;
} catch (error) {
console.error('[Firecrawl] Parse error:', error);
const data = await parseJSONResponse<FirecrawlResponse>(res, 'Firecrawl');
if (!data.data) {
throw new Error('Firecrawl response missing data field');
}
return;
if (data.data.warning) {
console.warn('[Firecrawl] Warning:', data.data.warning);
}
if (data.data.metadata.error) {
console.error('[Firecrawl] Metadata error:', data.data.metadata.error);
}
// Check if content is empty or too short
if (!data.data.markdown || data.data.markdown.length < 100) {
return;
}
return {
content: data.data.markdown,
contentType: 'text',
description: data.data.metadata.description || '',
length: data.data.markdown.length,
siteName: new URL(url).hostname,
title: data.data.metadata.title || '',
url,
} satisfies CrawlSuccessResult;
};

View file

@ -1,37 +1,59 @@
import type { CrawlImpl } from '../type';
import { toFetchError } from '../utils/errorType';
import { parseJSONResponse } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
// Crawler implementation backed by the Jina reader endpoint (https://r.jina.ai/<url>).
// Returns a text result built from the JSON payload's `data` field, or undefined
// when the response is unusable.
// NOTE(review): this span is a whole-file diff hunk; the pre-change body (direct
// `fetch`, `if (res.ok)`, `throw json`, `console.error`) and the post-change body
// (`withTimeout`, `toFetchError`, `parseJSONResponse`) are shown interleaved
// without +/- markers.
export const jina: CrawlImpl<{ apiKey?: string }> = async (url, params) => {
// Token precedence: explicit params.apiKey, then JINA_READER_API_KEY, then JINA_API_KEY.
const token = params.apiKey ?? process.env.JINA_READER_API_KEY ?? process.env.JINA_API_KEY;
let res: Response;
try {
const res = await fetch(`https://r.jina.ai/${url}`, {
headers: {
'Accept': 'application/json',
'Authorization': token ? `Bearer ${token}` : '',
'x-send-from': 'LobeChat Community',
},
});
if (res.ok) {
const json = await res.json();
if (json.code === 200) {
const result = json.data;
return {
content: result.content,
contentType: 'text',
description: result?.description,
length: result.content.length,
siteName: result?.siteName,
title: result?.title,
url: url,
};
}
throw json;
}
} catch (error) {
console.error(error);
// The request is created by a factory that receives `signal` and forwards it to
// fetch; presumably withTimeout aborts that signal on timeout — confirm in
// ../utils/withTimeout.
res = await withTimeout(
(signal) =>
fetch(`https://r.jina.ai/${url}`, {
headers: {
'Accept': 'application/json',
// An empty Authorization header is sent when no token is configured.
'Authorization': token ? `Bearer ${token}` : '',
'x-send-from': 'LobeChat Community',
},
signal,
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
// Any failure of the request is rethrown after conversion by toFetchError
// (its mapping rules are defined in ../utils/errorType, outside this view).
throw toFetchError(e);
}
return;
// NOTE(review): a non-OK status yields undefined without surfacing the status,
// whereas the sibling crawlers in this commit (exa, firecrawl, search1api, tavily)
// throw createHTTPStatusError — confirm this asymmetry is intentional.
if (!res.ok) {
return;
}
const json = await parseJSONResponse<{
code: number;
data: {
content: string;
description?: string;
siteName?: string;
title?: string;
};
}>(res, 'Jina');
// A payload-level code other than 200 is treated as "no result".
if (json.code !== 200) {
return;
}
const result = json.data;
// Missing content, or content shorter than 100 characters, is treated as "no result".
if (!result?.content || result.content.length < 100) {
return;
}
return {
content: result.content,
contentType: 'text',
description: result?.description,
length: result.content.length,
siteName: result?.siteName,
title: result?.title,
url,
};
};

View file

@ -1,8 +1,9 @@
import { ssrfSafeFetch } from '@lobechat/ssrf-safe-fetch';
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
import { createHTTPStatusError } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
const mixinHeaders = {
@ -39,28 +40,25 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
try {
res = await withTimeout(
ssrfSafeFetch(url, {
headers: mixinHeaders,
signal: new AbortController().signal,
}),
(signal) =>
ssrfSafeFetch(url, {
headers: mixinHeaders,
signal,
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
const error = e as Error;
if (error.message === 'fetch failed') {
throw new NetworkConnectionError();
}
if (error instanceof TimeoutError) {
throw error;
}
throw e;
throw toFetchError(e);
}
if (res.status === 404) {
throw new PageNotFoundError(res.statusText);
}
if (!res.ok) {
throw await createHTTPStatusError(res, 'Naive');
}
const type = res.headers.get('content-type');
if (type?.includes('application/json')) {
@ -74,7 +72,7 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
}
return {
content: content,
content,
contentType: 'json',
length: content.length,
url,
@ -91,8 +89,8 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
return;
}
// it's blocked by cloudflare
if (result.title !== 'Just a moment...') {
// It's blocked by Cloudflare.
if (result.title === 'Just a moment...') {
return;
}

View file

@ -1,5 +1,6 @@
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
interface Search1ApiResponse {
@ -21,29 +22,22 @@ export const search1api: CrawlImpl = async (url) => {
try {
res = await withTimeout(
fetch('https://api.search1api.com/crawl', {
body: JSON.stringify({
url,
(signal) =>
fetch('https://api.search1api.com/crawl', {
body: JSON.stringify({
url,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
signal,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
const error = e as Error;
if (error.message === 'fetch failed') {
throw new NetworkConnectionError();
}
if (error instanceof TimeoutError) {
throw error;
}
throw e;
throw toFetchError(e);
}
if (!res.ok) {
@ -51,30 +45,24 @@ export const search1api: CrawlImpl = async (url) => {
throw new PageNotFoundError(res.statusText);
}
throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
throw await createHTTPStatusError(res, 'Search1API');
}
try {
const data = (await res.json()) as Search1ApiResponse;
const data = await parseJSONResponse<Search1ApiResponse>(res, 'Search1API');
// Check if content is empty or too short
if (!data.results.content || data.results.content.length < 100) {
return;
}
return {
content: data.results.content,
contentType: 'text',
description: data.results.title,
// Using title as description since API doesn't provide a separate description
length: data.results.content.length,
siteName: new URL(url).hostname,
title: data.results.title,
url: data.results.link || url,
} satisfies CrawlSuccessResult;
} catch (error) {
console.error(error);
// Check if content is empty or too short
if (!data.results?.content || data.results.content.length < 100) {
return;
}
return;
return {
content: data.results.content,
contentType: 'text',
description: data.results?.title,
// Using title as description since API doesn't provide a separate description
length: data.results.content.length,
siteName: new URL(url).hostname,
title: data.results?.title,
url: data.results?.link || url,
} satisfies CrawlSuccessResult;
};

View file

@ -1,5 +1,6 @@
import type { CrawlImpl, CrawlSuccessResult } from '../type';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
import { PageNotFoundError, toFetchError } from '../utils/errorType';
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
interface TavilyResults {
@ -28,31 +29,24 @@ export const tavily: CrawlImpl = async (url) => {
try {
res = await withTimeout(
fetch('https://api.tavily.com/extract', {
body: JSON.stringify({
extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced
include_images: false,
urls: url,
(signal) =>
fetch('https://api.tavily.com/extract', {
body: JSON.stringify({
extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced
include_images: false,
urls: url,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
signal,
}),
headers: {
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
method: 'POST',
}),
DEFAULT_TIMEOUT,
);
} catch (e) {
const error = e as Error;
if (error.message === 'fetch failed') {
throw new NetworkConnectionError();
}
if (error instanceof TimeoutError) {
throw error;
}
throw e;
throw toFetchError(e);
}
if (!res.ok) {
@ -60,35 +54,29 @@ export const tavily: CrawlImpl = async (url) => {
throw new PageNotFoundError(res.statusText);
}
throw new Error(`Tavily request failed with status ${res.status}: ${res.statusText}`);
throw await createHTTPStatusError(res, 'Tavily');
}
try {
const data = (await res.json()) as TavilyResponse;
const data = await parseJSONResponse<TavilyResponse>(res, 'Tavily');
if (!data.results || data.results.length === 0) {
console.warn('Tavily API returned no results for URL:', url);
return;
}
const firstResult = data.results[0];
// Check if content is empty or too short
if (!firstResult.raw_content || firstResult.raw_content.length < 100) {
return;
}
return {
content: firstResult.raw_content,
contentType: 'text',
length: firstResult.raw_content.length,
siteName: new URL(url).hostname,
title: new URL(url).hostname,
url: firstResult.url || url,
} satisfies CrawlSuccessResult;
} catch (error) {
console.error(error);
if (!data.results || data.results.length === 0) {
console.warn('Tavily API returned no results for URL:', url);
return;
}
return;
const firstResult = data.results[0];
// Check if content is empty or too short
if (!firstResult.raw_content || firstResult.raw_content.length < 100) {
return;
}
return {
content: firstResult.raw_content,
contentType: 'text',
length: firstResult.raw_content.length,
siteName: new URL(url).hostname,
title: new URL(url).hostname,
url: firstResult.url || url,
} satisfies CrawlSuccessResult;
};

View file

@ -59,13 +59,18 @@ export class Crawler {
try {
const res = await crawlImpls[impl](transformedUrl, { filterOptions: mergedFilterOptions });
if (res && res.content && res.content?.length > 100)
if (res && res.content && res.content.length > 100) {
return {
crawler: impl,
data: res,
originalUrl: url,
transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
};
}
finalError = new Error(`${impl} returned empty or short content`);
finalError.name = 'EmptyCrawlResultError';
finalCrawler = impl;
} catch (error) {
console.error(error);
finalError = error as Error;
@ -77,10 +82,10 @@ export class Crawler {
const errorMessage = finalError?.message;
return {
crawler: finalCrawler!,
crawler: finalCrawler || finalImpls.at(-1) || 'unknown',
data: {
content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
errorMessage: errorMessage,
errorMessage,
errorType,
},
originalUrl: url,

View file

@ -0,0 +1,25 @@
import { vi } from 'vitest';
/**
 * Build a lightweight stand-in for a fetch `Response` for crawler tests.
 *
 * `json`, `text` and `clone` are `vi.fn()` mocks, so an individual test can override any of them.
 * `clone()` resolves to a copy of the mock that carries its own fresh `json`/`text` mocks.
 *
 * @param body Value resolved by `json()`. `text()` resolves the body itself when it is a
 *   string, otherwise its JSON serialization.
 * @param opts `ok` flag plus optional `status`/`statusText`. When omitted they fall back to
 *   200/'OK' for ok responses and 500/'Internal Server Error' otherwise.
 * @returns The mock response object (typed as `any`).
 */
export const createMockResponse = (
  body: any,
  opts: { ok: boolean; status?: number; statusText?: string } = { ok: true },
) => {
  // Serialized once and shared by the response and its clone.
  const bodyText = typeof body === 'string' ? body : JSON.stringify(body);
  const fallbackStatus = opts.ok ? 200 : 500;
  const fallbackStatusText = opts.ok ? 'OK' : 'Internal Server Error';

  const mockResponse: any = {
    ok: opts.ok,
    status: opts.status ?? fallbackStatus,
    statusText: opts.statusText ?? fallbackStatusText,
    json: vi.fn().mockResolvedValue(body),
    text: vi.fn().mockResolvedValue(bodyText),
    clone: vi.fn(),
  };

  // The clone gets independent json/text mocks so overriding one side does not affect the other.
  const clonedResponse = {
    ...mockResponse,
    json: vi.fn().mockResolvedValue(body),
    text: vi.fn().mockResolvedValue(bodyText),
  };
  mockResponse.clone.mockReturnValue(clonedResponse);

  return mockResponse;
};

View file

@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest';
import { applyUrlRules } from './appUrlRules';
import { applyUrlRules } from '../appUrlRules';
describe('applyUrlRules', () => {
// @gru-agent github file rules 不要改

View file

@ -1,6 +1,12 @@
import { describe, expect, it } from 'vitest';
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../errorType';
import {
isFetchNetworkError,
NetworkConnectionError,
PageNotFoundError,
TimeoutError,
toFetchError,
} from '../errorType';
describe('errorType', () => {
describe('PageNotFoundError', () => {
@ -170,6 +176,43 @@ describe('errorType', () => {
});
});
describe('isFetchNetworkError', () => {
it('should return true for TypeError with "fetch failed" message', () => {
expect(isFetchNetworkError(new TypeError('fetch failed'))).toBe(true);
});
it('should return false for plain Error with "fetch failed" message', () => {
expect(isFetchNetworkError(new Error('fetch failed'))).toBe(false);
});
it('should return false for TypeError with different message', () => {
expect(isFetchNetworkError(new TypeError('something else'))).toBe(false);
});
it('should return false for non-error values', () => {
expect(isFetchNetworkError('fetch failed')).toBe(false);
expect(isFetchNetworkError(null)).toBe(false);
expect(isFetchNetworkError(undefined)).toBe(false);
});
});
describe('toFetchError', () => {
it('should return NetworkConnectionError for fetch network errors', () => {
const result = toFetchError(new TypeError('fetch failed'));
expect(result).toBeInstanceOf(NetworkConnectionError);
});
it('should return TimeoutError as-is', () => {
const timeout = new TimeoutError('Request timeout after 10000ms');
expect(toFetchError(timeout)).toBe(timeout);
});
it('should return unknown errors unchanged', () => {
const unknown = new Error('something unexpected');
expect(toFetchError(unknown)).toBe(unknown);
});
});
describe('error catching scenarios', () => {
it('should allow catching specific error types', () => {
const testErrors = [

View file

@ -0,0 +1,102 @@
import { describe, expect, it } from 'vitest';
import { createHTTPStatusError, parseJSONResponse, ResponseBodyParseError } from '../response';
/**
 * Create a real `Response` carrying `body` for the response-utility tests.
 *
 * @param body Raw body text of the response.
 * @param options Optional `status` (default 200) and `statusText` (default 'OK'). `ok`
 *   (default true) only selects the Content-Type header: 'application/json' when true,
 *   'text/html' otherwise.
 */
const createMockResponse = (
  body: string,
  options: { ok?: boolean; status?: number; statusText?: string } = {},
) => {
  const treatAsOk = options.ok === undefined ? true : options.ok;
  const contentType = treatAsOk ? 'application/json' : 'text/html';

  const init = {
    headers: { 'Content-Type': contentType },
    status: options.status === undefined ? 200 : options.status,
    statusText: options.statusText === undefined ? 'OK' : options.statusText,
  };

  return new Response(body, init);
};
// Message and name of the error raised when a provider answers with a non-JSON body.
describe('ResponseBodyParseError', () => {
  it('should create error with provider and body snippet', () => {
    const { message, name } = new ResponseBodyParseError('Jina', '<html>error</html>');

    expect(message).toBe('Jina returned non-JSON response: <html>error</html>');
    expect(name).toBe('ResponseBodyParseError');
  });

  it('should create error without body snippet', () => {
    const { message } = new ResponseBodyParseError('Firecrawl');

    expect(message).toBe('Firecrawl returned non-JSON response');
  });
});
// parseJSONResponse: returns the parsed payload, or raises ResponseBodyParseError with a body snippet.
describe('parseJSONResponse', () => {
  it('should parse valid JSON response', async () => {
    const payload = { code: 200, results: ['a', 'b'] };

    const parsed = await parseJSONResponse<typeof payload>(
      createMockResponse(JSON.stringify(payload)),
      'TestProvider',
    );

    expect(parsed).toEqual(payload);
  });

  it('should throw ResponseBodyParseError for non-JSON response', async () => {
    const html = '<html><body>Error</body></html>';

    // Each assertion gets its own Response instance.
    const byType = parseJSONResponse(createMockResponse(html), 'Jina');
    await expect(byType).rejects.toThrow(ResponseBodyParseError);

    const byMessage = parseJSONResponse(createMockResponse(html), 'Jina');
    await expect(byMessage).rejects.toThrow('Jina returned non-JSON response');
  });

  it('should include body snippet in error for non-JSON response', async () => {
    const pending = parseJSONResponse(
      createMockResponse('<html><body>Internal Server Error</body></html>'),
      'Firecrawl',
    );

    await expect(pending).rejects.toThrow(
      /Firecrawl returned non-JSON response: .*Internal Server Error/,
    );
  });

  it('should handle empty response body', async () => {
    const pending = parseJSONResponse(createMockResponse(''), 'TestProvider');

    await expect(pending).rejects.toThrow('TestProvider returned non-JSON response');
  });
});
// createHTTPStatusError: the message always carries provider, status and status text, and
// appends ". Response: <snippet>" only when a non-empty body snippet could be read.
describe('createHTTPStatusError', () => {
  it('should create error with status and body snippet', async () => {
    const response = createMockResponse('Not Found', {
      ok: false,
      status: 404,
      statusText: 'Not Found',
    });

    const error = await createHTTPStatusError(response, 'Exa');

    // Exact match so the status line and the appended body snippet are both verified.
    expect(error.message).toBe(
      'Exa request failed with status 404: Not Found. Response: Not Found',
    );
  });

  it('should create error without body when response body is empty', async () => {
    const response = createMockResponse('', {
      ok: false,
      status: 500,
      statusText: 'Internal Server Error',
    });

    const error = await createHTTPStatusError(response, 'Tavily');

    expect(error.message).toBe('Tavily request failed with status 500: Internal Server Error');
  });

  it('should create error without body when response text fails', async () => {
    const response = createMockResponse('Bad Gateway', {
      ok: false,
      status: 502,
      statusText: 'Bad Gateway',
    });
    // Consume the body up front so the helper's own `text()` call rejects.
    await response.text();

    const error = await createHTTPStatusError(response, 'Tavily');

    expect(error.message).toBe('Tavily request failed with status 502: Bad Gateway');
  });

  it('should truncate long body snippets', async () => {
    const longBody = 'x'.repeat(500);
    const response = createMockResponse(longBody, { ok: false, status: 500, statusText: 'Error' });

    const error = await createHTTPStatusError(response, 'Test');

    // Only the first 200 characters of the body may appear. A length bound alone would also
    // pass for an untruncated 500-character body, so the full message is pinned.
    expect(error.message).toBe(
      `Test request failed with status 500: Error. Response: ${'x'.repeat(200)}`,
    );
  });
});

View file

@ -12,18 +12,18 @@ describe('withTimeout', () => {
vi.useRealTimers();
});
it('should resolve when promise resolves before timeout', async () => {
const promise = Promise.resolve('success');
const result = await withTimeout(promise, 1000);
it('should resolve when factory function resolves before timeout', async () => {
const result = await withTimeout(() => Promise.resolve('success'), 1000);
expect(result).toBe('success');
});
it('should reject with TimeoutError when promise takes too long', async () => {
const slowPromise = new Promise((resolve) => {
setTimeout(() => resolve('too late'), 200);
});
it('should reject with TimeoutError when factory takes too long', async () => {
const fn = () =>
new Promise((resolve) => {
setTimeout(() => resolve('too late'), 200);
});
const timeoutPromise = withTimeout(slowPromise, 100);
const timeoutPromise = withTimeout(fn, 100);
vi.advanceTimersByTime(100);
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
@ -31,32 +31,70 @@ describe('withTimeout', () => {
});
it('should use DEFAULT_TIMEOUT when no timeout specified', async () => {
const slowPromise = new Promise((resolve) => {
setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100);
});
const fn = () =>
new Promise((resolve) => {
setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100);
});
const timeoutPromise = withTimeout(slowPromise);
const timeoutPromise = withTimeout(fn);
vi.advanceTimersByTime(DEFAULT_TIMEOUT);
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
await expect(timeoutPromise).rejects.toThrow(`Request timeout after ${DEFAULT_TIMEOUT}ms`);
});
it('should reject with original error if promise rejects before timeout', async () => {
it('should reject with original error if factory rejects before timeout', async () => {
const error = new Error('Original error');
const failingPromise = Promise.reject(error);
const fn = () => Promise.reject(error);
await expect(withTimeout(failingPromise, 1000)).rejects.toThrow('Original error');
await expect(withTimeout(fn, 1000)).rejects.toThrow('Original error');
});
it('should abort controller when timeout occurs', async () => {
const slowPromise = new Promise((resolve) => {
setTimeout(() => resolve('too late'), 2000);
});
it('should pass AbortSignal to the factory function', async () => {
const factoryFn = vi.fn().mockResolvedValue('result');
await withTimeout(factoryFn, 1000);
const timeoutPromise = withTimeout(slowPromise, 1000);
vi.advanceTimersByTime(1000);
expect(factoryFn).toHaveBeenCalledTimes(1);
const signal = factoryFn.mock.calls[0][0];
expect(signal).toBeInstanceOf(AbortSignal);
expect(signal.aborted).toBe(false);
});
it('should abort the signal when timeout occurs', async () => {
let capturedSignal: AbortSignal | undefined;
const fn = (signal: AbortSignal) => {
capturedSignal = signal;
return new Promise((resolve) => {
setTimeout(() => resolve('too late'), 2000);
});
};
const timeoutPromise = withTimeout(fn, 100);
expect(capturedSignal!.aborted).toBe(false);
vi.advanceTimersByTime(100);
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
expect(capturedSignal!.aborted).toBe(true);
});
it('should clear timeout timer when promise resolves successfully', async () => {
const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout');
await withTimeout(() => Promise.resolve('success'), 5000);
expect(clearTimeoutSpy).toHaveBeenCalled();
clearTimeoutSpy.mockRestore();
});
it('should clear timeout timer when promise rejects', async () => {
const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout');
await expect(withTimeout(() => Promise.reject(new Error('fail')), 5000)).rejects.toThrow(
'fail',
);
expect(clearTimeoutSpy).toHaveBeenCalled();
clearTimeoutSpy.mockRestore();
});
});

View file

@ -17,3 +17,34 @@ export class TimeoutError extends Error {
this.name = 'TimeoutError';
}
}
/**
 * Tell whether a caught value is a Node.js fetch network failure.
 * Node.js undici signals network errors with a `TypeError` whose message is "fetch failed",
 * so both the error class and the exact message must match.
 *
 * @param error Any caught value, not necessarily an `Error`.
 * @returns `true` only for a `TypeError` with the message "fetch failed".
 */
export const isFetchNetworkError = (error: unknown): boolean => {
  if (!(error instanceof TypeError)) return false;

  return error.message === 'fetch failed';
};
/**
 * Map whatever a fetch call threw onto the error the caller should rethrow.
 *
 * - A fetch network failure becomes a new `NetworkConnectionError`.
 * - A `TimeoutError`, like every other value, is handed back as the same object.
 *
 * The function only returns; the caller is responsible for throwing the result.
 * NOTE(review): values that are not `Error` instances are returned as-is behind a cast.
 *
 * @example
 * ```ts
 * } catch (e) {
 *   throw toFetchError(e);
 * }
 * ```
 */
export const toFetchError = (error: unknown): Error => {
  // Only network failures are converted; timeouts and unknown errors pass through untouched.
  return isFetchNetworkError(error) ? new NetworkConnectionError() : (error as Error);
};

View file

@ -1,5 +1,5 @@
import { readFileSync } from 'node:fs';
import * as path from 'node:path';
import path from 'node:path';
import { describe, expect, it } from 'vitest';
@ -33,4 +33,29 @@ describe('htmlToMarkdown', () => {
expect(data).toMatchSnapshot();
}, 20000);
});
it('should truncate HTML exceeding 1 MB', () => {
// Create HTML slightly over 1 MB
const maxSize = 1024 * 1024;
const largeContent = 'x'.repeat(maxSize + 1000);
const html = `<html><body><p>${largeContent}</p></body></html>`;
// Should not throw - the function handles large HTML by truncating
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
// Verify content was produced (truncated HTML is still parseable)
expect(result).toBeDefined();
expect(result.content).toBeDefined();
// The output content should be smaller than the input due to truncation
expect(result.content.length).toBeLessThan(html.length);
}, 20000);
it('should not truncate HTML under 1 MB', () => {
const html = '<html><body><p>Small content</p></body></html>';
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
expect(result).toBeDefined();
expect(result.content).toContain('Small content');
});
});

View file

@ -5,6 +5,9 @@ import { NodeHtmlMarkdown } from 'node-html-markdown';
import type { FilterOptions } from '../type';
/** Truncate HTML to 1 MB before DOM parsing to prevent CPU spikes on large pages */
const MAX_HTML_SIZE = 1024 * 1024;
const cleanObj = <T extends object>(
obj: T,
): {
@ -24,9 +27,10 @@ interface HtmlToMarkdownOutput {
}
export const htmlToMarkdown = (
html: string,
rawHtml: string,
{ url, filterOptions }: { filterOptions: FilterOptions; url: string },
): HtmlToMarkdownOutput => {
const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml;
const window = new Window({ url });
const document = window.document;

View file

@ -0,0 +1,49 @@
/** Maximum number of characters of a response body quoted in an error message. */
const ERROR_BODY_SNIPPET_LIMIT = 200;

/** Collapse every run of whitespace in `body` into one space and trim both ends. */
const normalizeBodySnippet = (body: string) => {
  const collapsed = body.replaceAll(/\s+/g, ' ');
  return collapsed.trim();
};
/**
 * Error raised when a provider's response body could not be parsed as JSON.
 * The message names the provider and, when a non-empty snippet is supplied, quotes it.
 */
export class ResponseBodyParseError extends Error {
  /**
   * @param provider Display name of the provider that produced the response.
   * @param bodySnippet Optional excerpt of the body, appended after ": " when non-empty.
   */
  constructor(provider: string, bodySnippet?: string) {
    super(`${provider} returned non-JSON response` + (bodySnippet ? `: ${bodySnippet}` : ''));
    this.name = 'ResponseBodyParseError';
  }
}
/**
 * Read the response body and reduce it to a short, single-line excerpt for error messages.
 *
 * @param response Response whose body is read via `text()`.
 * @returns The whitespace-normalized body cut to `ERROR_BODY_SNIPPET_LIMIT` characters, or
 *   `undefined` when the excerpt is empty or anything in the process throws.
 */
const getBodySnippet = async (response: Response): Promise<string | undefined> => {
  try {
    const rawBody = await response.text();
    const excerpt = normalizeBodySnippet(rawBody).slice(0, ERROR_BODY_SNIPPET_LIMIT);

    if (excerpt.length === 0) return undefined;
    return excerpt;
  } catch {
    // Best effort: the snippet only decorates an error message, so failures are swallowed.
    return undefined;
  }
};
/**
 * Parse a provider response as JSON, turning parse failures into a descriptive error.
 *
 * @param response Response to read; a clone is taken before parsing.
 * @param provider Provider name used in the error message.
 * @returns The parsed body cast to `T` (no runtime validation is performed).
 * @throws ResponseBodyParseError when `response.json()` rejects; the error quotes a body
 *   snippet read from the clone when one is available.
 */
export const parseJSONResponse = async <T>(response: Response, provider: string): Promise<T> => {
  // Taken before json() so the raw body can still be read from the copy if parsing fails.
  const fallbackCopy = response.clone();

  let parsed: T;
  try {
    parsed = (await response.json()) as T;
  } catch {
    const snippet = await getBodySnippet(fallbackCopy);
    throw new ResponseBodyParseError(provider, snippet);
  }

  return parsed;
};
/**
 * Build (but do not throw) an `Error` describing a non-OK provider response.
 *
 * @param response Response whose status, status text and body are reported; its body is read.
 * @param provider Provider name placed at the start of the message.
 * @returns An `Error` whose message is "<provider> request failed with status <status>:
 *   <statusText>", followed by ". Response: <snippet>" when a body snippet is available.
 */
export const createHTTPStatusError = async (
  response: Response,
  provider: string,
): Promise<Error> => {
  const snippet = await getBodySnippet(response);
  const summary = `${provider} request failed with status ${response.status}: ${response.statusText}`;

  if (!snippet) return new Error(summary);

  return new Error(`${summary}. Response: ${snippet}`);
};

View file

@ -3,19 +3,28 @@ import { TimeoutError } from './errorType';
export const DEFAULT_TIMEOUT = 10_000;
/**
* Wraps a promise with a timeout
* @param promise Promise to wrap
* Wraps a factory function with a timeout and abort support.
* The factory receives an AbortSignal that is aborted on timeout,
* allowing the underlying request (e.g. fetch) to be properly cancelled.
* @param fn Factory function that receives an AbortSignal and returns a Promise
* @param ms Timeout in milliseconds
* @returns Promise that will be rejected if it takes longer than ms to resolve
*/
export const withTimeout = <T>(promise: Promise<T>, ms: number = DEFAULT_TIMEOUT): Promise<T> => {
export const withTimeout = <T>(
fn: (signal: AbortSignal) => Promise<T>,
ms: number = DEFAULT_TIMEOUT,
): Promise<T> => {
const controller = new AbortController();
let timeoutId: ReturnType<typeof setTimeout>;
const timeoutPromise = new Promise<T>((_, reject) => {
setTimeout(() => {
timeoutId = setTimeout(() => {
controller.abort();
reject(new TimeoutError(`Request timeout after ${ms}ms`));
}, ms);
});
return Promise.race([promise, timeoutPromise]);
return Promise.race([
fn(controller.signal).finally(() => clearTimeout(timeoutId)),
timeoutPromise,
]);
};

View file

@ -1,15 +1,25 @@
import { createEnv } from '@t3-oss/env-nextjs';
import { z } from 'zod';
/**
 * Zod schema for an optional integer environment variable bounded to `[min, max]`.
 * An empty string or `null` is mapped to `undefined` before validation, so it counts as
 * "not set"; any other value is coerced to a number and must be an integer within range.
 *
 * @param min Smallest accepted value (inclusive).
 * @param max Largest accepted value (inclusive).
 */
const optionalNumberEnv = (min: number, max: number) => {
  const boundedInteger = z.coerce.number().int().max(max).min(min).optional();

  return z.preprocess((value) => {
    if (value === '' || value === null) return undefined;
    return value;
  }, boundedInteger);
};
export const getToolsConfig = () => {
return createEnv({
runtimeEnv: {
CRAWL_CONCURRENCY: process.env.CRAWL_CONCURRENCY,
CRAWLER_RETRY: process.env.CRAWLER_RETRY,
CRAWLER_IMPLS: process.env.CRAWLER_IMPLS,
SEARCH_PROVIDERS: process.env.SEARCH_PROVIDERS,
SEARXNG_URL: process.env.SEARXNG_URL,
},
server: {
CRAWL_CONCURRENCY: optionalNumberEnv(1, 10),
CRAWLER_RETRY: optionalNumberEnv(0, 3),
CRAWLER_IMPLS: z.string().optional(),
SEARCH_PROVIDERS: z.string().optional(),
SEARXNG_URL: z.string().url().optional(),

View file

@ -46,6 +46,27 @@ describe('searchRouter', () => {
expect(result.results[1]).toEqual({ content: 'test content' });
});
it('should accept all supported crawler implementations', async () => {
const caller = searchRouter.createCaller(mockContext as any);
const allImpls = [
'browserless',
'exa',
'firecrawl',
'jina',
'naive',
'search1api',
'tavily',
] as const;
for (const impl of allImpls) {
const result = await caller.crawlPages({
urls: ['http://test.com'],
impls: [impl],
});
expect(result.results).toHaveLength(1);
}
});
it('should work without specifying impls', async () => {
const caller = searchRouter.createCaller(mockContext as any);

View file

@ -9,7 +9,10 @@ export const searchRouter = router({
crawlPages: searchProcedure
.input(
z.object({
impls: z.enum(['jina', 'naive', 'browserless']).array().optional(),
impls: z
.enum(['browserless', 'exa', 'firecrawl', 'jina', 'naive', 'search1api', 'tavily'])
.array()
.optional(),
urls: z.string().array(),
}),
)

View file

@ -3,7 +3,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
import { toolsEnv } from '@/envs/tools';
import { createSearchServiceImpl,SearchImplType } from './impls';
import { createSearchServiceImpl, SearchImplType } from './impls';
import { SearchService } from './index';
// Mock dependencies
@ -11,7 +11,9 @@ vi.mock('@lobechat/web-crawler');
vi.mock('./impls');
vi.mock('@/envs/tools', () => ({
toolsEnv: {
CRAWL_CONCURRENCY: undefined,
CRAWLER_IMPLS: '',
CRAWLER_RETRY: undefined,
SEARCH_PROVIDERS: '',
},
}));
@ -279,10 +281,9 @@ describe('SearchService', () => {
describe('crawlPages', () => {
it('should crawl multiple pages concurrently', async () => {
const mockCrawlResult = {
content: 'Page content',
description: 'Page description',
title: 'Page title',
url: 'https://example.com',
crawler: 'naive',
data: { content: 'Page content', contentType: 'text' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
@ -304,8 +305,13 @@ describe('SearchService', () => {
it('should use crawler implementations from env', async () => {
vi.mocked(toolsEnv).CRAWLER_IMPLS = 'jina,reader';
const mockSuccessResult = {
crawler: 'jina',
data: { content: 'ok', contentType: 'text' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValue({}),
crawl: vi.fn().mockResolvedValue(mockSuccessResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
@ -317,8 +323,13 @@ describe('SearchService', () => {
});
it('should pass impls parameter to crawler.crawl', async () => {
const mockSuccessResult = {
crawler: 'jina',
data: { content: 'ok', contentType: 'text' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValue({}),
crawl: vi.fn().mockResolvedValue(mockSuccessResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
@ -334,5 +345,133 @@ describe('SearchService', () => {
url: 'https://example.com',
});
});
it('should use CRAWL_CONCURRENCY from env', async () => {
vi.mocked(toolsEnv).CRAWL_CONCURRENCY = 1;
const mockCrawler = {
crawl: vi.fn().mockResolvedValue({
crawler: 'naive',
data: { content: 'ok', contentType: 'text' },
originalUrl: 'https://example.com',
}),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const urls = ['https://a.com', 'https://b.com'];
await searchService.crawlPages({ urls });
// All URLs should still be crawled
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
});
it('should retry on failed crawl results', async () => {
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
const failedResult = {
crawler: 'naive',
data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' },
originalUrl: 'https://example.com',
};
const successResult = {
crawler: 'naive',
data: { content: 'Page content', contentType: 'text' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValueOnce(failedResult).mockResolvedValueOnce(successResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
expect(result.results[0]).toBe(successResult);
});
it('should return last failed result after all retries exhausted', async () => {
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
const failedResult = {
crawler: 'naive',
data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValue(failedResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); // 1 + 1 retry
expect(result.results[0]).toBe(failedResult);
});
it('should not retry when CRAWLER_RETRY is 0', async () => {
vi.mocked(toolsEnv).CRAWLER_RETRY = 0;
const failedResult = {
crawler: 'naive',
data: { content: 'Fail', errorType: 'Error', errorMessage: 'fail' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValue(failedResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
expect(mockCrawler.crawl).toHaveBeenCalledTimes(1);
expect(result.results[0]).toBe(failedResult);
});
it('should handle crawl exceptions during retry', async () => {
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
const mockCrawler = {
crawl: vi.fn().mockRejectedValue(new Error('Network error')),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
expect(result.results[0].data).toMatchObject({
errorType: 'Error',
errorMessage: 'Network error',
});
});
it('should detect successful results by contentType presence', async () => {
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
const successResult = {
crawler: 'naive',
data: { content: 'Page content', contentType: 'text' },
originalUrl: 'https://example.com',
};
const mockCrawler = {
crawl: vi.fn().mockResolvedValue(successResult),
};
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
searchService = new SearchService();
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
// Should not retry since result has contentType (successful)
expect(mockCrawler.crawl).toHaveBeenCalledTimes(1);
expect(result.results[0]).toBe(successResult);
});
});
});

View file

@ -1,5 +1,5 @@
import { type SearchParams, type SearchQuery } from '@lobechat/types';
import { type CrawlImplType } from '@lobechat/web-crawler';
import type { SearchParams, SearchQuery } from '@lobechat/types';
import type { Crawler, CrawlImplType, CrawlUniformResult } from '@lobechat/web-crawler';
import pMap from 'p-map';
import { toolsEnv } from '@/envs/tools';
@ -7,6 +7,9 @@ import { toolsEnv } from '@/envs/tools';
import { type SearchImplType, type SearchServiceImpl } from './impls';
import { createSearchServiceImpl } from './impls';
const DEFAULT_CRAWL_CONCURRENCY = 3;
const DEFAULT_CRAWLER_RETRY = 1;
const parseImplEnv = (envString: string = '') => {
// Handle full-width commas and extra whitespace
const envValue = envString.replaceAll('', ',').trim();
@ -24,6 +27,14 @@ export class SearchService {
return parseImplEnv(toolsEnv.CRAWLER_IMPLS);
}
private get crawlConcurrency() {
return toolsEnv.CRAWL_CONCURRENCY ?? DEFAULT_CRAWL_CONCURRENCY;
}
private get crawlerRetry() {
return toolsEnv.CRAWLER_RETRY ?? DEFAULT_CRAWLER_RETRY;
}
constructor() {
const impls = this.searchImpls;
// TODO: need use turn mode
@ -37,14 +48,59 @@ export class SearchService {
const results = await pMap(
input.urls,
async (url) => {
return await crawler.crawl({ impls: input.impls, url });
return await this.crawlWithRetry(crawler, url, input.impls);
},
{ concurrency: 3 },
{ concurrency: this.crawlConcurrency },
);
return { results };
}
/**
 * Crawl a single URL, repeating the crawl while it keeps failing.
 *
 * The crawl runs at most `crawlerRetry + 1` times. The first result that is not a failed
 * result is returned immediately. Otherwise the most recent result the crawler returned is
 * used; if every attempt threw, a synthetic failure result is built from the last error.
 *
 * @param crawler Crawler instance used for every attempt.
 * @param url Page URL to crawl.
 * @param impls Optional crawler implementations forwarded to `crawler.crawl`.
 * @returns A crawl result; this method itself does not let crawl errors propagate.
 */
private async crawlWithRetry(
  crawler: Crawler,
  url: string,
  impls?: CrawlImplType[],
): Promise<CrawlUniformResult> {
  let latestResult: CrawlUniformResult | undefined;
  let latestError: Error | undefined;

  for (let remaining = this.crawlerRetry + 1; remaining > 0; remaining--) {
    try {
      latestResult = await crawler.crawl({ impls, url });
      if (this.isFailedCrawlResult(latestResult)) continue;

      return latestResult;
    } catch (error) {
      // Remember the failure and move on to the next attempt.
      latestError = error as Error;
    }
  }

  // A failed result returned by the crawler takes precedence over a thrown error.
  if (latestResult) return latestResult;

  const errorType = latestError?.name || 'UnknownError';
  const errorMessage = latestError?.message;

  return {
    crawler: 'unknown',
    data: {
      content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
      errorMessage,
      errorType,
    },
    originalUrl: url,
  };
}
/**
 * Decide whether a crawl result represents a failure.
 *
 * Successful results always carry `contentType` (e.g. 'text', 'json') in `result.data`;
 * failed results carry `errorType`/`errorMessage` instead. The check therefore only looks
 * for the presence of the `contentType` key.
 */
private isFailedCrawlResult(result: CrawlUniformResult): boolean {
  const hasContentType = 'contentType' in result.data;

  return !hasContentType;
}
private get searchImpls() {
return parseImplEnv(toolsEnv.SEARCH_PROVIDERS) as SearchImplType[];
}
@ -58,17 +114,17 @@ export class SearchService {
async webSearch({ query, searchCategories, searchEngines, searchTimeRange }: SearchQuery) {
let data = await this.query(query, {
searchCategories: searchCategories,
searchEngines: searchEngines,
searchTimeRange: searchTimeRange,
searchCategories,
searchEngines,
searchTimeRange,
});
// First retry: remove search engine restrictions if no results found
if (data.results.length === 0 && searchEngines && searchEngines?.length > 0) {
const paramsExcludeSearchEngines = {
searchCategories: searchCategories,
searchCategories,
searchEngines: undefined,
searchTimeRange: searchTimeRange,
searchTimeRange,
};
data = await this.query(query, paramsExcludeSearchEngines);
}