mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 09:37:28 +00:00
🐛 fix: improve crawler error handling and timeout cancellation (#12487)
This commit is contained in:
parent
0365a14e16
commit
306c50704e
38 changed files with 1462 additions and 784 deletions
55
.agents/skills/pr/SKILL.md
Normal file
55
.agents/skills/pr/SKILL.md
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
---
|
||||
name: pr
|
||||
description: "Create a PR for the current branch. Use when the user asks to create a pull request, submit PR, or says 'pr'."
|
||||
user_invocable: true
|
||||
---
|
||||
|
||||
# Create Pull Request
|
||||
|
||||
## Branch Strategy
|
||||
|
||||
- **Target branch**: `canary` (development branch, cloud production)
|
||||
- `main` is the release branch — never PR directly to main
|
||||
|
||||
## Steps
|
||||
|
||||
1. **Gather context** (run in parallel):
|
||||
- `git branch --show-current` — current branch name
|
||||
- `git rev-parse --abbrev-ref @{u} 2>/dev/null` — remote tracking status
|
||||
- `git log --oneline origin/canary..HEAD` — commits on this branch that are not yet in `canary` (pushed or not)
|
||||
- `gh pr list --head "$(git branch --show-current)" --json number,title,state,url` — existing PR
|
||||
- `git log --oneline origin/canary..HEAD` — commit history for PR title
|
||||
- `git diff --stat --stat-count=20 origin/canary..HEAD` — change summary
|
||||
|
||||
2. **Push if needed**:
|
||||
- No upstream: `git push -u origin $(git branch --show-current)`
|
||||
- Has upstream: `git push origin $(git branch --show-current)`
|
||||
|
||||
3. **Search related GitHub issues**:
|
||||
- `gh issue list --search "<keywords>" --state all --limit 10`
|
||||
- Only link issues with matching scope (avoid large umbrella issues)
|
||||
- Skip if no matching issue found
|
||||
|
||||
4. **Create PR** with `gh pr create --base canary`:
|
||||
- Title: `<gitmoji> <type>(<scope>): <description>`
|
||||
- Body: based on PR template (`.github/PULL_REQUEST_TEMPLATE.md`), fill checkboxes
|
||||
- Link related GitHub issues using magic keywords (`Fixes #123`, `Closes #123`)
|
||||
- Link Linear issues if applicable (`Fixes LOBE-xxx`)
|
||||
- Use HEREDOC for body to preserve formatting
|
||||
|
||||
5. **Open in browser**: `gh pr view --web`
|
||||
|
||||
## PR Template
|
||||
|
||||
Use `.github/PULL_REQUEST_TEMPLATE.md` as the body structure. Key sections:
|
||||
|
||||
- **Change Type**: Check the appropriate gitmoji type
|
||||
- **Related Issue**: Link GitHub/Linear issues with magic keywords
|
||||
- **Description of Change**: Summarize what and why
|
||||
- **How to Test**: Describe test approach, check relevant boxes
|
||||
|
||||
## Notes
|
||||
|
||||
- **Release impact**: PR titles with `✨ feat/` or `🐛 fix` trigger releases — use carefully
|
||||
- **Language**: All PR content must be in English
|
||||
- If a PR already exists for the branch, inform the user instead of creating a duplicate
|
||||
|
|
@ -1,3 +1,8 @@
|
|||
---
|
||||
name: upstash-workflow
|
||||
description: 'Upstash Workflow implementation guide. Use when creating async workflows with QStash, implementing fan-out patterns, or building 3-layer workflow architecture (process → paginate → execute).'
|
||||
---
|
||||
|
||||
# Upstash Workflow Implementation Guide
|
||||
|
||||
This guide covers the standard patterns for implementing Upstash Workflow + QStash async workflows in the LobeHub codebase.
|
||||
|
|
|
|||
|
|
@ -38,7 +38,8 @@ lobe-chat/
|
|||
|
||||
### Git Workflow
|
||||
|
||||
- The current release branch is `next` until v2.0.0 is officially released
|
||||
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
|
||||
- New branches should be created from `canary`; PRs should target `canary`
|
||||
- Use rebase for git pull
|
||||
- Git commit messages should prefix with gitmoji
|
||||
- Git branch name format: `username/feat/feature-name`
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ lobe-chat/
|
|||
|
||||
### Git Workflow
|
||||
|
||||
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
|
||||
- New branches should be created from `canary`; PRs should target `canary`
|
||||
- Use rebase for `git pull`
|
||||
- Commit messages: prefix with gitmoji
|
||||
- Branch format: `<type>/<feature-name>`
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@ lobe-chat/
|
|||
|
||||
### Git Workflow
|
||||
|
||||
- **Branch strategy**: `canary` is the development branch (cloud production); `main` is the release branch (periodically cherry-picks from canary)
|
||||
- New branches should be created from `canary`; PRs should target `canary`
|
||||
- Use rebase for `git pull`
|
||||
- Commit messages: prefix with gitmoji
|
||||
- Branch format: `<type>/<feature-name>`
|
||||
|
|
|
|||
|
|
@ -51,6 +51,24 @@ Supported crawler types are listed below:
|
|||
|
||||
---
|
||||
|
||||
## `CRAWL_CONCURRENCY`
|
||||
|
||||
Controls crawler concurrency per crawl task. The default is `3`. On low-resource servers, use `1` to reduce CPU spikes.
|
||||
|
||||
```env
|
||||
CRAWL_CONCURRENCY=3
|
||||
```
|
||||
|
||||
## `CRAWLER_RETRY`
|
||||
|
||||
Controls retry attempts per URL on crawl failures. The default is `1` (up to 2 attempts total).
|
||||
|
||||
```env
|
||||
CRAWLER_RETRY=1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## `SEARCH_PROVIDERS`
|
||||
|
||||
Configure which search engine providers to use for web search.
|
||||
|
|
|
|||
|
|
@ -46,6 +46,24 @@ CRAWLER_IMPLS="naive,search1api"
|
|||
|
||||
---
|
||||
|
||||
## `CRAWL_CONCURRENCY`
|
||||
|
||||
控制单次网页抓取任务的并发数量,默认值为 `3`。在低配置服务器上建议设置为 `1` 以降低 CPU 峰值。
|
||||
|
||||
```env
|
||||
CRAWL_CONCURRENCY=3
|
||||
```
|
||||
|
||||
## `CRAWLER_RETRY`
|
||||
|
||||
控制单个 URL 的抓取失败重试次数,默认值为 `1`(即最多尝试 2 次)。
|
||||
|
||||
```env
|
||||
CRAWLER_RETRY=1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## `SEARCH_PROVIDERS`
|
||||
|
||||
配置联网搜索使用的搜索引擎提供商。
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { Crawler } from '../crawler';
|
||||
|
||||
|
|
@ -19,6 +19,16 @@ vi.mock('../utils/appUrlRules', () => ({
|
|||
}));
|
||||
|
||||
describe('Crawler', () => {
|
||||
beforeEach(async () => {
|
||||
vi.clearAllMocks();
|
||||
// Reset applyUrlRules to default (no impls override)
|
||||
const { applyUrlRules } = await import('../utils/appUrlRules');
|
||||
vi.mocked(applyUrlRules).mockReturnValue({
|
||||
transformedUrl: 'https://example.com',
|
||||
filterOptions: {},
|
||||
});
|
||||
});
|
||||
|
||||
const crawler = new Crawler();
|
||||
|
||||
it('should crawl successfully with default impls', async () => {
|
||||
|
|
@ -194,11 +204,12 @@ describe('Crawler', () => {
|
|||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
crawler: undefined,
|
||||
crawler: 'browserless',
|
||||
data: {
|
||||
content: 'Fail to crawl the page. Error type: UnknownError, error message: undefined',
|
||||
errorMessage: undefined,
|
||||
errorType: 'UnknownError',
|
||||
content:
|
||||
'Fail to crawl the page. Error type: EmptyCrawlResultError, error message: browserless returned empty or short content',
|
||||
errorMessage: 'browserless returned empty or short content',
|
||||
errorType: 'EmptyCrawlResultError',
|
||||
},
|
||||
originalUrl: 'https://example.com',
|
||||
transformedUrl: undefined,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,13 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import * as withTimeoutModule from '../../utils/withTimeout';
|
||||
import { browserless } from '../browserless';
|
||||
|
||||
// Mock withTimeout to just call the factory function directly (bypassing real timeout)
|
||||
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
|
||||
fn(new AbortController().signal),
|
||||
);
|
||||
|
||||
describe('browserless', () => {
|
||||
it('should throw BrowserlessInitError when env vars not set', async () => {
|
||||
const originalEnv = { ...process.env };
|
||||
|
|
@ -16,17 +22,22 @@ describe('browserless', () => {
|
|||
process.env = originalEnv;
|
||||
});
|
||||
|
||||
it('should return undefined on fetch error', async () => {
|
||||
it('should throw NetworkConnectionError on fetch failed', async () => {
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
global.fetch = vi.fn().mockRejectedValue(new Error('Fetch error'));
|
||||
global.fetch = vi.fn().mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
const result = await browserless('https://example.com', { filterOptions: {} });
|
||||
expect(result).toBeUndefined();
|
||||
const { NetworkConnectionError } = await import('../../utils/errorType');
|
||||
await expect(browserless('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
);
|
||||
});
|
||||
|
||||
it('should return undefined when content is empty', async () => {
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
global.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
text: vi.fn().mockResolvedValue('<html></html>'),
|
||||
} as any);
|
||||
|
||||
|
|
@ -37,6 +48,9 @@ describe('browserless', () => {
|
|||
it('should return undefined when title is "Just a moment..."', async () => {
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
global.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
text: vi.fn().mockResolvedValue('<html><title>Just a moment...</title></html>'),
|
||||
} as any);
|
||||
|
||||
|
|
@ -46,7 +60,12 @@ describe('browserless', () => {
|
|||
|
||||
it('should return crawl result on successful fetch', async () => {
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
const longContent =
|
||||
'This is a test paragraph with enough content to pass the length check. '.repeat(3);
|
||||
global.fetch = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
text: vi.fn().mockResolvedValue(`
|
||||
<html>
|
||||
<head>
|
||||
|
|
@ -54,7 +73,7 @@ describe('browserless', () => {
|
|||
<meta name="description" content="Test Description">
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Content</h1>
|
||||
<p>${longContent}</p>
|
||||
</body>
|
||||
</html>
|
||||
`),
|
||||
|
|
@ -76,6 +95,9 @@ describe('browserless', () => {
|
|||
it('should include rejectRequestPattern in request payload', async () => {
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
const fetchMock = vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
text: vi.fn().mockResolvedValue('<html><title>Test</title></html>'),
|
||||
});
|
||||
global.fetch = fetchMock;
|
||||
|
|
@ -90,9 +112,7 @@ describe('browserless', () => {
|
|||
|
||||
it('should allow requests to permitted file types', async () => {
|
||||
const allowedExtensions = ['html', 'css', 'js', 'json', 'xml', 'webmanifest', 'txt', 'md'];
|
||||
const pattern = new RegExp(
|
||||
'.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$',
|
||||
);
|
||||
const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/;
|
||||
|
||||
allowedExtensions.forEach((ext) => {
|
||||
expect(`file.${ext}`).not.toMatch(pattern);
|
||||
|
|
@ -103,9 +123,7 @@ describe('browserless', () => {
|
|||
|
||||
it('should reject requests to non-permitted file types', async () => {
|
||||
const rejectedExtensions = ['jpg', 'png', 'gif', 'pdf', 'doc', 'mp4', 'wav'];
|
||||
const pattern = new RegExp(
|
||||
'.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[?#].*)?$',
|
||||
);
|
||||
const pattern = /.*\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\?|#|$))[\w-]+(?:[?#].*)?$/;
|
||||
|
||||
rejectedExtensions.forEach((ext) => {
|
||||
expect(`file.${ext}`).toMatch(pattern);
|
||||
|
|
@ -114,14 +132,16 @@ describe('browserless', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should use correct URL when BROWSERLESS_URL is provided', async () => {
|
||||
const customUrl = 'https://custom.browserless.io';
|
||||
it('should call fetch with the base URL and content path', async () => {
|
||||
const originalEnv = { ...process.env };
|
||||
process.env.BROWSERLESS_TOKEN = 'test-token';
|
||||
process.env.BROWSERLESS_URL = customUrl;
|
||||
global.fetch = vi.fn().mockImplementation((url) => {
|
||||
expect(url).toContain(customUrl);
|
||||
// BASE_URL is captured at module load time, so we verify fetch is called with /content path
|
||||
expect(url).toContain('/content');
|
||||
return Promise.resolve({
|
||||
ok: true,
|
||||
status: 200,
|
||||
statusText: 'OK',
|
||||
text: () => Promise.resolve('<html><title>Test</title></html>'),
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createMockResponse } from '../../test-utils';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
|
||||
import { exa } from '../exa';
|
||||
|
||||
|
|
@ -18,23 +19,20 @@ describe('exa crawler', () => {
|
|||
it('should successfully crawl content with API key', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
requestId: 'test-request-id',
|
||||
results: [
|
||||
{
|
||||
id: 'test-id',
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'This is a test article with enough content to pass the length check. '.repeat(3),
|
||||
author: 'Test Author',
|
||||
publishedDate: '2023-01-01',
|
||||
summary: 'Test summary',
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
requestId: 'test-request-id',
|
||||
results: [
|
||||
{
|
||||
id: 'test-id',
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'This is a test article with enough content to pass the length check. '.repeat(3),
|
||||
author: 'Test Author',
|
||||
publishedDate: '2023-01-01',
|
||||
summary: 'Test summary',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -51,23 +49,20 @@ describe('exa crawler', () => {
|
|||
url: 'https://example.com',
|
||||
});
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should handle missing API key', async () => {
|
||||
// API key is undefined
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -75,19 +70,16 @@ describe('exa crawler', () => {
|
|||
await exa('https://example.com', { filterOptions: {} });
|
||||
|
||||
// Check that the request is still issued through withTimeout (30s) when no API key is set
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should return undefined when no results are returned', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
requestId: 'test-request-id',
|
||||
results: [],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
requestId: 'test-request-id',
|
||||
results: [],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -108,18 +100,15 @@ describe('exa crawler', () => {
|
|||
it('should return undefined for short content', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'Short', // Content too short
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://example.com',
|
||||
text: 'Short', // Content too short
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -132,11 +121,11 @@ describe('exa crawler', () => {
|
|||
it('should throw PageNotFoundError for 404 status', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('Not Found', {
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -149,11 +138,11 @@ describe('exa crawler', () => {
|
|||
it('should throw error for other HTTP errors', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('', {
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -167,7 +156,7 @@ describe('exa crawler', () => {
|
|||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
|
||||
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
|
|
@ -198,42 +187,37 @@ describe('exa crawler', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('should return undefined when JSON parsing fails', async () => {
|
||||
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
const mockResponse = createMockResponse('not json', { ok: true });
|
||||
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
|
||||
mockResponse.clone.mockReturnValue({
|
||||
...mockResponse,
|
||||
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
|
||||
};
|
||||
text: vi.fn().mockResolvedValue('not json'),
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
|
||||
|
||||
const result = await exa('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toBeUndefined();
|
||||
expect(consoleSpy).toHaveBeenCalled();
|
||||
|
||||
consoleSpy.mockRestore();
|
||||
await expect(exa('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
'Exa returned non-JSON response: not json',
|
||||
);
|
||||
});
|
||||
|
||||
it('should use result URL when available', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://redirected.example.com',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
url: 'https://redirected.example.com',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -246,18 +230,15 @@ describe('exa crawler', () => {
|
|||
it('should fallback to original URL when result URL is missing', async () => {
|
||||
process.env.EXA_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
// url is missing
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
results: [
|
||||
{
|
||||
title: 'Test Article',
|
||||
text: 'Test content with sufficient length. '.repeat(5),
|
||||
// url is missing
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createMockResponse } from '../../test-utils';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
|
||||
import { firecrawl } from '../firecrawl';
|
||||
|
||||
|
|
@ -19,25 +20,23 @@ describe('firecrawl crawler', () => {
|
|||
it('should successfully crawl content with API key', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
data: {
|
||||
markdown:
|
||||
'This is a test markdown content with enough length to pass validation. '.repeat(3),
|
||||
metadata: {
|
||||
title: 'Test Article',
|
||||
description: 'Test description',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
const mockResponse = createMockResponse({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'This is a test markdown content with enough length to pass validation. '.repeat(
|
||||
3,
|
||||
),
|
||||
metadata: {
|
||||
title: 'Test Article',
|
||||
description: 'Test description',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
}),
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -55,58 +54,52 @@ describe('firecrawl crawler', () => {
|
|||
url: 'https://example.com',
|
||||
});
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should handle missing API key', async () => {
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Test content with sufficient length. '.repeat(5),
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
const mockResponse = createMockResponse({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Test content with sufficient length. '.repeat(5),
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
}),
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
await firecrawl('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should return undefined for short content', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Short', // Content too short
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
const mockResponse = createMockResponse({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Short', // Content too short
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
}),
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -119,24 +112,21 @@ describe('firecrawl crawler', () => {
|
|||
it('should return undefined when markdown is missing', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
data: {
|
||||
// markdown is missing
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
const mockResponse = createMockResponse({
|
||||
success: true,
|
||||
data: {
|
||||
// markdown is missing
|
||||
metadata: {
|
||||
title: 'Test',
|
||||
description: 'Test',
|
||||
sourceURL: 'https://example.com',
|
||||
statusCode: 200,
|
||||
language: 'en',
|
||||
keywords: 'test',
|
||||
robots: 'index',
|
||||
},
|
||||
}),
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -149,11 +139,11 @@ describe('firecrawl crawler', () => {
|
|||
it('should throw PageNotFoundError for 404 status', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('Not Found', {
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -166,11 +156,11 @@ describe('firecrawl crawler', () => {
|
|||
it('should throw error for other HTTP errors', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('', {
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -184,7 +174,7 @@ describe('firecrawl crawler', () => {
|
|||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
|
||||
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
|
|
@ -217,54 +207,49 @@ describe('firecrawl crawler', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('should return undefined when JSON parsing fails', async () => {
|
||||
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
const mockResponse = createMockResponse('not json', { ok: true });
|
||||
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
|
||||
mockResponse.clone.mockReturnValue({
|
||||
...mockResponse,
|
||||
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
|
||||
};
|
||||
text: vi.fn().mockResolvedValue('not json'),
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
|
||||
|
||||
const result = await firecrawl('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toBeUndefined();
|
||||
expect(consoleSpy).toHaveBeenCalled();
|
||||
|
||||
consoleSpy.mockRestore();
|
||||
await expect(firecrawl('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
'Firecrawl returned non-JSON response: not json',
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle metadata with all optional fields', async () => {
|
||||
process.env.FIRECRAWL_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Complete test content with all metadata fields provided. '.repeat(3),
|
||||
metadata: {
|
||||
title: 'Complete Test Article',
|
||||
description: 'Complete test description',
|
||||
keywords: 'test,complete,article',
|
||||
language: 'en',
|
||||
ogDescription: 'OG description',
|
||||
ogImage: 'https://example.com/image.jpg',
|
||||
ogLocaleAlternate: ['en-US', 'fr-FR'],
|
||||
ogSiteName: 'Example Site',
|
||||
ogTitle: 'OG Title',
|
||||
ogUrl: 'https://example.com/og',
|
||||
robots: 'index,follow',
|
||||
statusCode: 200,
|
||||
sourceURL: 'https://example.com',
|
||||
},
|
||||
const mockResponse = createMockResponse({
|
||||
success: true,
|
||||
data: {
|
||||
markdown: 'Complete test content with all metadata fields provided. '.repeat(3),
|
||||
metadata: {
|
||||
title: 'Complete Test Article',
|
||||
description: 'Complete test description',
|
||||
keywords: 'test,complete,article',
|
||||
language: 'en',
|
||||
ogDescription: 'OG description',
|
||||
ogImage: 'https://example.com/image.jpg',
|
||||
ogLocaleAlternate: ['en-US', 'fr-FR'],
|
||||
ogSiteName: 'Example Site',
|
||||
ogTitle: 'OG Title',
|
||||
ogUrl: 'https://example.com/og',
|
||||
robots: 'index,follow',
|
||||
statusCode: 200,
|
||||
sourceURL: 'https://example.com',
|
||||
},
|
||||
}),
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
|
|||
|
|
@ -1,29 +1,44 @@
|
|||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createMockResponse } from '../../test-utils';
|
||||
import * as withTimeoutModule from '../../utils/withTimeout';
|
||||
import { jina } from '../jina';
|
||||
|
||||
// Mock withTimeout to just call the factory function directly (bypassing real timeout)
|
||||
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
|
||||
fn(new AbortController().signal),
|
||||
);
|
||||
|
||||
describe('jina crawler', () => {
|
||||
const mockFetch = vi.fn();
|
||||
global.fetch = mockFetch;
|
||||
|
||||
beforeEach(() => {
|
||||
vi.resetAllMocks();
|
||||
// Re-apply the withTimeout spy after resetAllMocks
|
||||
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
|
||||
fn(new AbortController().signal),
|
||||
);
|
||||
});
|
||||
|
||||
it('should crawl url successfully', async () => {
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
description: 'test description',
|
||||
siteName: 'test site',
|
||||
title: 'test title',
|
||||
},
|
||||
}),
|
||||
};
|
||||
const testContent =
|
||||
'This is a test content that is long enough to pass the minimum length validation check. '.repeat(
|
||||
2,
|
||||
);
|
||||
|
||||
const mockResponse = createMockResponse(
|
||||
{
|
||||
code: 200,
|
||||
data: {
|
||||
content: testContent,
|
||||
description: 'test description',
|
||||
siteName: 'test site',
|
||||
title: 'test title',
|
||||
},
|
||||
},
|
||||
{ ok: true },
|
||||
);
|
||||
|
||||
mockFetch.mockResolvedValue(mockResponse);
|
||||
|
||||
|
|
@ -38,13 +53,14 @@ describe('jina crawler', () => {
|
|||
'Authorization': 'Bearer test-key',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
signal: expect.any(AbortSignal),
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
content: 'test content',
|
||||
content: testContent,
|
||||
contentType: 'text',
|
||||
description: 'test description',
|
||||
length: 12,
|
||||
length: testContent.length,
|
||||
siteName: 'test site',
|
||||
title: 'test title',
|
||||
url: 'https://example.com',
|
||||
|
|
@ -54,16 +70,15 @@ describe('jina crawler', () => {
|
|||
it('should use JINA_READER_API_KEY from env if apiKey not provided', async () => {
|
||||
process.env.JINA_READER_API_KEY = 'env-reader-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse(
|
||||
{
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
},
|
||||
{ ok: true },
|
||||
);
|
||||
|
||||
mockFetch.mockResolvedValue(mockResponse);
|
||||
|
||||
|
|
@ -75,6 +90,7 @@ describe('jina crawler', () => {
|
|||
'Authorization': 'Bearer env-reader-key',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
signal: expect.any(AbortSignal),
|
||||
});
|
||||
|
||||
delete process.env.JINA_READER_API_KEY;
|
||||
|
|
@ -83,16 +99,15 @@ describe('jina crawler', () => {
|
|||
it('should use JINA_API_KEY from env if apiKey and JINA_READER_API_KEY not provided', async () => {
|
||||
process.env.JINA_API_KEY = 'env-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse(
|
||||
{
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
},
|
||||
{ ok: true },
|
||||
);
|
||||
|
||||
mockFetch.mockResolvedValue(mockResponse);
|
||||
|
||||
|
|
@ -104,22 +119,22 @@ describe('jina crawler', () => {
|
|||
'Authorization': 'Bearer env-key',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
signal: expect.any(AbortSignal),
|
||||
});
|
||||
|
||||
delete process.env.JINA_API_KEY;
|
||||
});
|
||||
|
||||
it('should send empty Authorization header if no api key provided', async () => {
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse(
|
||||
{
|
||||
code: 200,
|
||||
data: {
|
||||
content: 'test content',
|
||||
},
|
||||
},
|
||||
{ ok: true },
|
||||
);
|
||||
|
||||
mockFetch.mockResolvedValue(mockResponse);
|
||||
|
||||
|
|
@ -131,11 +146,14 @@ describe('jina crawler', () => {
|
|||
'Authorization': '',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
signal: expect.any(AbortSignal),
|
||||
});
|
||||
});
|
||||
|
||||
it('should return undefined if response is not ok', async () => {
|
||||
mockFetch.mockResolvedValue({ ok: false });
|
||||
mockFetch.mockResolvedValue(
|
||||
createMockResponse(null, { ok: false, status: 500, statusText: 'Internal Server Error' }),
|
||||
);
|
||||
|
||||
const result = await jina('https://example.com', { filterOptions: {} });
|
||||
|
||||
|
|
@ -143,14 +161,13 @@ describe('jina crawler', () => {
|
|||
});
|
||||
|
||||
it('should return undefined if response code is not 200', async () => {
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
code: 400,
|
||||
message: 'Bad Request',
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse(
|
||||
{
|
||||
code: 400,
|
||||
message: 'Bad Request',
|
||||
},
|
||||
{ ok: true },
|
||||
);
|
||||
|
||||
mockFetch.mockResolvedValue(mockResponse);
|
||||
|
||||
|
|
@ -159,11 +176,11 @@ describe('jina crawler', () => {
|
|||
expect(result).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should return undefined if fetch throws error', async () => {
|
||||
it('should throw error if fetch throws non-fetch-failed error', async () => {
|
||||
mockFetch.mockRejectedValue(new Error('Network error'));
|
||||
|
||||
const result = await jina('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toBeUndefined();
|
||||
await expect(jina('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
'Network error',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -22,9 +22,10 @@ describe('naive crawler', () => {
|
|||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it('should return undefined for normal pages (due to cloudflare logic)', async () => {
|
||||
it('should return content for normal pages', async () => {
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'text/html']]),
|
||||
text: vi.fn().mockResolvedValue('<html><body>Test content</body></html>'),
|
||||
};
|
||||
|
|
@ -34,8 +35,8 @@ describe('naive crawler', () => {
|
|||
|
||||
const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown');
|
||||
vi.mocked(htmlToMarkdown).mockReturnValue({
|
||||
content: 'Test content'.padEnd(101, ' '), // Ensure length > 100
|
||||
title: 'Normal Page Title', // Not "Just a moment..." so it returns undefined
|
||||
content: 'Test content'.padEnd(101, ' '),
|
||||
title: 'Normal Page Title',
|
||||
description: 'Test description',
|
||||
siteName: 'Test Site',
|
||||
length: 101,
|
||||
|
|
@ -43,13 +44,22 @@ describe('naive crawler', () => {
|
|||
|
||||
const result = await naive('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toBeUndefined();
|
||||
expect(result).toEqual({
|
||||
content: 'Test content'.padEnd(101, ' '),
|
||||
contentType: 'text',
|
||||
description: 'Test description',
|
||||
length: 101,
|
||||
siteName: 'Test Site',
|
||||
title: 'Normal Page Title',
|
||||
url: 'https://example.com',
|
||||
});
|
||||
});
|
||||
|
||||
it('should successfully crawl JSON content', async () => {
|
||||
const mockJsonData = { message: 'Hello world', data: [1, 2, 3] };
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'application/json']]),
|
||||
clone: () => ({
|
||||
json: vi.fn().mockResolvedValue(mockJsonData),
|
||||
|
|
@ -74,6 +84,7 @@ describe('naive crawler', () => {
|
|||
const mockText = '{"invalid": json}';
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'application/json']]),
|
||||
clone: () => ({
|
||||
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
|
||||
|
|
@ -97,6 +108,7 @@ describe('naive crawler', () => {
|
|||
it('should return undefined for short content', async () => {
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'text/html']]),
|
||||
text: vi.fn().mockResolvedValue('<html><body>Short</body></html>'),
|
||||
};
|
||||
|
|
@ -116,9 +128,10 @@ describe('naive crawler', () => {
|
|||
expect(result).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should return content when NOT blocked by Cloudflare', async () => {
|
||||
it('should return undefined when blocked by Cloudflare', async () => {
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'text/html']]),
|
||||
text: vi.fn().mockResolvedValue('<html><body>Normal content</body></html>'),
|
||||
};
|
||||
|
|
@ -129,7 +142,7 @@ describe('naive crawler', () => {
|
|||
const { htmlToMarkdown } = await import('../../utils/htmlToMarkdown');
|
||||
vi.mocked(htmlToMarkdown).mockReturnValue({
|
||||
content: 'Test content'.padEnd(101, ' '),
|
||||
title: 'Just a moment...', // Cloudflare blocking page - this will cause return
|
||||
title: 'Just a moment...', // Cloudflare blocking page
|
||||
description: 'Test description',
|
||||
siteName: 'Test Site',
|
||||
length: 101,
|
||||
|
|
@ -137,15 +150,21 @@ describe('naive crawler', () => {
|
|||
|
||||
const result = await naive('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toEqual({
|
||||
content: 'Test content'.padEnd(101, ' '),
|
||||
contentType: 'text',
|
||||
description: 'Test description',
|
||||
length: 101,
|
||||
siteName: 'Test Site',
|
||||
title: 'Just a moment...',
|
||||
url: 'https://example.com',
|
||||
});
|
||||
expect(result).toBeUndefined();
|
||||
});
|
||||
|
||||
it('should throw error for non-ok status codes', async () => {
|
||||
const mockResponse = {
|
||||
status: 500,
|
||||
ok: false,
|
||||
statusText: 'Internal Server Error',
|
||||
text: vi.fn().mockResolvedValue('Server Error'),
|
||||
};
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow(/500/);
|
||||
});
|
||||
|
||||
it('should throw PageNotFoundError for 404 status', async () => {
|
||||
|
|
@ -164,7 +183,7 @@ describe('naive crawler', () => {
|
|||
|
||||
it('should throw NetworkConnectionError for fetch failures', async () => {
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
|
||||
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
await expect(naive('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
|
|
@ -194,6 +213,7 @@ describe('naive crawler', () => {
|
|||
it('should return undefined when HTML processing fails', async () => {
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'text/html']]),
|
||||
text: vi.fn().mockRejectedValue(new Error('Failed to read text')),
|
||||
};
|
||||
|
|
@ -209,6 +229,7 @@ describe('naive crawler', () => {
|
|||
it('should pass filter options to htmlToMarkdown', async () => {
|
||||
const mockResponse = {
|
||||
status: 200,
|
||||
ok: true,
|
||||
headers: new Map([['content-type', 'text/html']]),
|
||||
text: vi.fn().mockResolvedValue('<html><body>Test content</body></html>'),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createMockResponse } from '../../test-utils';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
|
||||
import * as withTimeoutModule from '../../utils/withTimeout';
|
||||
import { search1api } from '../search1api';
|
||||
|
|
@ -17,8 +18,10 @@ describe('search1api crawler', () => {
|
|||
originalEnv = { ...process.env };
|
||||
process.env.SEARCH1API_API_KEY = 'test-api-key';
|
||||
|
||||
// Mock withTimeout to directly return the promise
|
||||
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((promise) => promise);
|
||||
// Mock withTimeout to call the factory function directly (bypassing real timeout)
|
||||
vi.spyOn(withTimeoutModule, 'withTimeout').mockImplementation((fn) =>
|
||||
fn(new AbortController().signal),
|
||||
);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
|
|
@ -26,7 +29,7 @@ describe('search1api crawler', () => {
|
|||
});
|
||||
|
||||
it('should throw NetworkConnectionError when fetch fails', async () => {
|
||||
mockFetch.mockRejectedValue(new Error('fetch failed'));
|
||||
mockFetch.mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
|
|
@ -48,11 +51,13 @@ describe('search1api crawler', () => {
|
|||
});
|
||||
|
||||
it('should throw PageNotFoundError when status is 404', async () => {
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
});
|
||||
mockFetch.mockResolvedValue(
|
||||
createMockResponse('Not Found', {
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
}),
|
||||
);
|
||||
|
||||
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
PageNotFoundError,
|
||||
|
|
@ -60,11 +65,13 @@ describe('search1api crawler', () => {
|
|||
});
|
||||
|
||||
it('should throw error for other failed responses', async () => {
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
});
|
||||
mockFetch.mockResolvedValue(
|
||||
createMockResponse('', {
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
}),
|
||||
);
|
||||
|
||||
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
'Search1API request failed with status 500: Internal Server Error',
|
||||
|
|
@ -72,18 +79,19 @@ describe('search1api crawler', () => {
|
|||
});
|
||||
|
||||
it('should return undefined when content is too short', async () => {
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
mockFetch.mockResolvedValue(
|
||||
createMockResponse(
|
||||
{
|
||||
crawlParameters: { url: 'https://example.com' },
|
||||
results: {
|
||||
title: 'Test Title',
|
||||
link: 'https://example.com',
|
||||
content: 'Short', // Less than 100 characters
|
||||
},
|
||||
}),
|
||||
});
|
||||
},
|
||||
{ ok: true },
|
||||
),
|
||||
);
|
||||
|
||||
const result = await search1api('https://example.com', { filterOptions: {} });
|
||||
expect(result).toBeUndefined();
|
||||
|
|
@ -92,18 +100,19 @@ describe('search1api crawler', () => {
|
|||
it('should return crawl result on successful fetch', async () => {
|
||||
const mockContent = 'This is a test content that is longer than 100 characters. '.repeat(3);
|
||||
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
json: () =>
|
||||
Promise.resolve({
|
||||
mockFetch.mockResolvedValue(
|
||||
createMockResponse(
|
||||
{
|
||||
crawlParameters: { url: 'https://example.com' },
|
||||
results: {
|
||||
title: 'Test Title',
|
||||
link: 'https://example.com',
|
||||
content: mockContent,
|
||||
},
|
||||
}),
|
||||
});
|
||||
},
|
||||
{ ok: true },
|
||||
),
|
||||
);
|
||||
|
||||
const result = await search1api('https://example.com', { filterOptions: {} });
|
||||
|
||||
|
|
@ -116,6 +125,7 @@ describe('search1api crawler', () => {
|
|||
body: JSON.stringify({
|
||||
url: 'https://example.com',
|
||||
}),
|
||||
signal: expect.any(AbortSignal),
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
|
|
@ -130,12 +140,18 @@ describe('search1api crawler', () => {
|
|||
});
|
||||
|
||||
it('should handle JSON parse errors', async () => {
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
json: () => Promise.reject(new Error('Invalid JSON')),
|
||||
});
|
||||
mockFetch.mockResolvedValue(createMockResponse('invalid json', { ok: true }));
|
||||
// Override json to reject for this specific test
|
||||
const response = createMockResponse('invalid json', { ok: true });
|
||||
response.json = () => Promise.reject(new Error('Invalid JSON'));
|
||||
// clone should also return a response whose text() works for error reporting
|
||||
response.clone = () => {
|
||||
const cloned = createMockResponse('invalid json', { ok: true });
|
||||
cloned.json = () => Promise.reject(new Error('Invalid JSON'));
|
||||
return cloned;
|
||||
};
|
||||
mockFetch.mockResolvedValue(response);
|
||||
|
||||
const result = await search1api('https://example.com', { filterOptions: {} });
|
||||
expect(result).toBeUndefined();
|
||||
await expect(search1api('https://example.com', { filterOptions: {} })).rejects.toThrow();
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createMockResponse } from '../../test-utils';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../../utils/errorType';
|
||||
import { tavily } from '../tavily';
|
||||
|
||||
|
|
@ -19,21 +20,18 @@ describe('tavily crawler', () => {
|
|||
it('should successfully crawl content with API key', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.5,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content:
|
||||
'This is a test raw content with sufficient length to pass validation. '.repeat(3),
|
||||
images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'],
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.5,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content:
|
||||
'This is a test raw content with sufficient length to pass validation. '.repeat(3),
|
||||
images: ['https://example.com/image1.jpg', 'https://example.com/image2.jpg'],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -50,69 +48,60 @@ describe('tavily crawler', () => {
|
|||
url: 'https://example.com',
|
||||
});
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should use custom extract depth when provided', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
process.env.TAVILY_EXTRACT_DEPTH = 'advanced';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 2.1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Advanced extraction content with more details. '.repeat(5),
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 2.1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Advanced extraction content with more details. '.repeat(5),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
await tavily('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should handle missing API key', async () => {
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.2,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.2,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
await tavily('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Promise), 30000);
|
||||
expect(withTimeout).toHaveBeenCalledWith(expect.any(Function), 30000);
|
||||
});
|
||||
|
||||
it('should return undefined when no results are returned', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 0.8,
|
||||
results: [],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 0.8,
|
||||
results: [],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -133,19 +122,16 @@ describe('tavily crawler', () => {
|
|||
it('should return undefined for short content', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Short', // Content too short
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
raw_content: 'Short', // Content too short
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -158,20 +144,17 @@ describe('tavily crawler', () => {
|
|||
it('should return undefined when raw_content is missing', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
// raw_content is missing
|
||||
images: ['https://example.com/image.jpg'],
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1,
|
||||
results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
// raw_content is missing
|
||||
images: ['https://example.com/image.jpg'],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -184,11 +167,11 @@ describe('tavily crawler', () => {
|
|||
it('should throw PageNotFoundError for 404 status', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('Not Found', {
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -201,11 +184,11 @@ describe('tavily crawler', () => {
|
|||
it('should throw error for other HTTP errors', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
const mockResponse = createMockResponse('', {
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
};
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -219,7 +202,7 @@ describe('tavily crawler', () => {
|
|||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockRejectedValue(new Error('fetch failed'));
|
||||
vi.mocked(withTimeout).mockRejectedValue(new TypeError('fetch failed'));
|
||||
|
||||
await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
NetworkConnectionError,
|
||||
|
|
@ -252,43 +235,38 @@ describe('tavily crawler', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('should return undefined when JSON parsing fails', async () => {
|
||||
it('should throw ResponseBodyParseError when JSON parsing fails', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
const mockResponse = createMockResponse('not json', { ok: true });
|
||||
mockResponse.json = vi.fn().mockRejectedValue(new Error('Invalid JSON'));
|
||||
mockResponse.clone.mockReturnValue({
|
||||
...mockResponse,
|
||||
json: vi.fn().mockRejectedValue(new Error('Invalid JSON')),
|
||||
};
|
||||
text: vi.fn().mockResolvedValue('not json'),
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
||||
const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
|
||||
|
||||
const result = await tavily('https://example.com', { filterOptions: {} });
|
||||
|
||||
expect(result).toBeUndefined();
|
||||
expect(consoleSpy).toHaveBeenCalled();
|
||||
|
||||
consoleSpy.mockRestore();
|
||||
await expect(tavily('https://example.com', { filterOptions: {} })).rejects.toThrow(
|
||||
'Tavily returned non-JSON response: not json',
|
||||
);
|
||||
});
|
||||
|
||||
it('should use result URL when available', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.3,
|
||||
results: [
|
||||
{
|
||||
url: 'https://redirected.example.com',
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.3,
|
||||
results: [
|
||||
{
|
||||
url: 'https://redirected.example.com',
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -301,19 +279,16 @@ describe('tavily crawler', () => {
|
|||
it('should fallback to original URL when result URL is missing', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.4,
|
||||
results: [
|
||||
{
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
// url is missing
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.4,
|
||||
results: [
|
||||
{
|
||||
raw_content: 'Test content with sufficient length. '.repeat(5),
|
||||
// url is missing
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
@ -326,20 +301,17 @@ describe('tavily crawler', () => {
|
|||
it('should handle failed results in response', async () => {
|
||||
process.env.TAVILY_API_KEY = 'test-api-key';
|
||||
|
||||
const mockResponse = {
|
||||
ok: true,
|
||||
json: vi.fn().mockResolvedValue({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.6,
|
||||
results: [],
|
||||
failed_results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
error: 'Page not accessible',
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
const mockResponse = createMockResponse({
|
||||
base_url: 'https://api.tavily.com',
|
||||
response_time: 1.6,
|
||||
results: [],
|
||||
failed_results: [
|
||||
{
|
||||
url: 'https://example.com',
|
||||
error: 'Page not accessible',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const { withTimeout } = await import('../../utils/withTimeout');
|
||||
vi.mocked(withTimeout).mockResolvedValue(mockResponse as any);
|
||||
|
|
|
|||
|
|
@ -2,7 +2,10 @@ import qs from 'query-string';
|
|||
import urlJoin from 'url-join';
|
||||
|
||||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
|
||||
import { createHTTPStatusError } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
|
||||
// Allowed file types: html, css, js, json, xml, webmanifest, txt, md
|
||||
|
|
@ -31,46 +34,62 @@ export const browserless: CrawlImpl = async (url, { filterOptions }) => {
|
|||
url,
|
||||
};
|
||||
|
||||
let res: Response;
|
||||
|
||||
try {
|
||||
const res = await fetch(
|
||||
qs.stringifyUrl({
|
||||
query: {
|
||||
blockAds: BROWSERLESS_BLOCK_ADS,
|
||||
launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }),
|
||||
token: BROWSERLESS_TOKEN,
|
||||
},
|
||||
url: urlJoin(BASE_URL, '/content'),
|
||||
}),
|
||||
{
|
||||
body: JSON.stringify(input),
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
},
|
||||
res = await withTimeout(
|
||||
(signal) =>
|
||||
fetch(
|
||||
qs.stringifyUrl({
|
||||
query: {
|
||||
blockAds: BROWSERLESS_BLOCK_ADS,
|
||||
launch: JSON.stringify({ stealth: BROWSERLESS_STEALTH_MODE }),
|
||||
token: BROWSERLESS_TOKEN,
|
||||
},
|
||||
url: urlJoin(BASE_URL, '/content'),
|
||||
}),
|
||||
{
|
||||
body: JSON.stringify(input),
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
signal,
|
||||
},
|
||||
),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
const html = await res.text();
|
||||
} catch (e) {
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
const result = htmlToMarkdown(html, { filterOptions, url });
|
||||
|
||||
if (
|
||||
!!result.content &&
|
||||
result.title &&
|
||||
// "Just a moment..." indicates being blocked by CloudFlare
|
||||
result.title.trim() !== 'Just a moment...'
|
||||
) {
|
||||
return {
|
||||
content: result.content,
|
||||
contentType: 'text',
|
||||
description: result?.description,
|
||||
length: result.length,
|
||||
siteName: result?.siteName,
|
||||
title: result?.title,
|
||||
url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
if (!res.ok) {
|
||||
if (res.status === 404) {
|
||||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
|
||||
throw await createHTTPStatusError(res, 'Browserless');
|
||||
}
|
||||
|
||||
const html = await res.text();
|
||||
const result = htmlToMarkdown(html, { filterOptions, url });
|
||||
|
||||
if (
|
||||
!!result.content &&
|
||||
result.content.length > 100 &&
|
||||
result.title &&
|
||||
// "Just a moment..." indicates being blocked by CloudFlare
|
||||
result.title.trim() !== 'Just a moment...'
|
||||
) {
|
||||
return {
|
||||
content: result.content,
|
||||
contentType: 'text',
|
||||
description: result?.description,
|
||||
length: result.length,
|
||||
siteName: result?.siteName,
|
||||
title: result?.title,
|
||||
url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
}
|
||||
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
interface ExaResults {
|
||||
|
|
@ -27,31 +28,24 @@ export const exa: CrawlImpl = async (url) => {
|
|||
|
||||
try {
|
||||
res = await withTimeout(
|
||||
fetch('https://api.exa.ai/contents', {
|
||||
body: JSON.stringify({
|
||||
livecrawl: 'fallback', // always, fallback
|
||||
text: true,
|
||||
urls: [url],
|
||||
(signal) =>
|
||||
fetch('https://api.exa.ai/contents', {
|
||||
body: JSON.stringify({
|
||||
livecrawl: 'fallback', // always, fallback
|
||||
text: true,
|
||||
urls: [url],
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': !apiKey ? '' : apiKey,
|
||||
},
|
||||
method: 'POST',
|
||||
signal,
|
||||
}),
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': !apiKey ? '' : apiKey,
|
||||
},
|
||||
method: 'POST',
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
if (error.message === 'fetch failed') {
|
||||
throw new NetworkConnectionError();
|
||||
}
|
||||
|
||||
if (error instanceof TimeoutError) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
throw e;
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
|
|
@ -59,35 +53,29 @@ export const exa: CrawlImpl = async (url) => {
|
|||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
|
||||
throw new Error(`Exa request failed with status ${res.status}: ${res.statusText}`);
|
||||
throw await createHTTPStatusError(res, 'Exa');
|
||||
}
|
||||
|
||||
try {
|
||||
const data = (await res.json()) as ExaResponse;
|
||||
const data = await parseJSONResponse<ExaResponse>(res, 'Exa');
|
||||
|
||||
if (!data.results || data.results.length === 0) {
|
||||
console.warn('Exa API returned no results for URL:', url);
|
||||
return;
|
||||
}
|
||||
|
||||
const firstResult = data.results[0];
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!firstResult.text || firstResult.text.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: firstResult.text,
|
||||
contentType: 'text',
|
||||
length: firstResult.text.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: firstResult.title,
|
||||
url: firstResult.url || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
if (!data.results || data.results.length === 0) {
|
||||
console.warn('Exa API returned no results for URL:', url);
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
const firstResult = data.results[0];
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!firstResult.text || firstResult.text.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: firstResult.text,
|
||||
contentType: 'text',
|
||||
length: firstResult.text.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: firstResult.title,
|
||||
url: firstResult.url || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
interface FirecrawlMetadata {
|
||||
|
|
@ -57,30 +58,23 @@ export const firecrawl: CrawlImpl = async (url) => {
|
|||
|
||||
try {
|
||||
res = await withTimeout(
|
||||
fetch(`${baseUrl}/scrape`, {
|
||||
body: JSON.stringify({
|
||||
formats: ['markdown'], // ["markdown", "html"]
|
||||
url,
|
||||
(signal) =>
|
||||
fetch(`${baseUrl}/scrape`, {
|
||||
body: JSON.stringify({
|
||||
formats: ['markdown'], // ["markdown", "html"]
|
||||
url,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
signal,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
if (error.message === 'fetch failed') {
|
||||
throw new NetworkConnectionError();
|
||||
}
|
||||
|
||||
if (error instanceof TimeoutError) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
throw e;
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
|
|
@ -88,37 +82,34 @@ export const firecrawl: CrawlImpl = async (url) => {
|
|||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
|
||||
throw new Error(`Firecrawl request failed with status ${res.status}: ${res.statusText}`);
|
||||
throw await createHTTPStatusError(res, 'Firecrawl');
|
||||
}
|
||||
|
||||
try {
|
||||
const data = (await res.json()) as FirecrawlResponse;
|
||||
|
||||
if (data.data.warning) {
|
||||
console.warn('[Firecrawl] Warning:', data.data.warning);
|
||||
}
|
||||
|
||||
if (data.data.metadata.error) {
|
||||
console.error('[Firecrawl] Metadata error:', data.data.metadata.error);
|
||||
}
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!data.data.markdown || data.data.markdown.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: data.data.markdown,
|
||||
contentType: 'text',
|
||||
description: data.data.metadata.description || '',
|
||||
length: data.data.markdown.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: data.data.metadata.title || '',
|
||||
url: url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
} catch (error) {
|
||||
console.error('[Firecrawl] Parse error:', error);
|
||||
const data = await parseJSONResponse<FirecrawlResponse>(res, 'Firecrawl');
|
||||
if (!data.data) {
|
||||
throw new Error('Firecrawl response missing data field');
|
||||
}
|
||||
|
||||
return;
|
||||
if (data.data.warning) {
|
||||
console.warn('[Firecrawl] Warning:', data.data.warning);
|
||||
}
|
||||
|
||||
if (data.data.metadata.error) {
|
||||
console.error('[Firecrawl] Metadata error:', data.data.metadata.error);
|
||||
}
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!data.data.markdown || data.data.markdown.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: data.data.markdown,
|
||||
contentType: 'text',
|
||||
description: data.data.metadata.description || '',
|
||||
length: data.data.markdown.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: data.data.metadata.title || '',
|
||||
url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,37 +1,59 @@
|
|||
import type { CrawlImpl } from '../type';
|
||||
import { toFetchError } from '../utils/errorType';
|
||||
import { parseJSONResponse } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
export const jina: CrawlImpl<{ apiKey?: string }> = async (url, params) => {
|
||||
const token = params.apiKey ?? process.env.JINA_READER_API_KEY ?? process.env.JINA_API_KEY;
|
||||
let res: Response;
|
||||
|
||||
try {
|
||||
const res = await fetch(`https://r.jina.ai/${url}`, {
|
||||
headers: {
|
||||
'Accept': 'application/json',
|
||||
'Authorization': token ? `Bearer ${token}` : '',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
const json = await res.json();
|
||||
if (json.code === 200) {
|
||||
const result = json.data;
|
||||
return {
|
||||
content: result.content,
|
||||
contentType: 'text',
|
||||
description: result?.description,
|
||||
length: result.content.length,
|
||||
siteName: result?.siteName,
|
||||
title: result?.title,
|
||||
url: url,
|
||||
};
|
||||
}
|
||||
|
||||
throw json;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
res = await withTimeout(
|
||||
(signal) =>
|
||||
fetch(`https://r.jina.ai/${url}`, {
|
||||
headers: {
|
||||
'Accept': 'application/json',
|
||||
'Authorization': token ? `Bearer ${token}` : '',
|
||||
'x-send-from': 'LobeChat Community',
|
||||
},
|
||||
signal,
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
return;
|
||||
if (!res.ok) {
|
||||
return;
|
||||
}
|
||||
|
||||
const json = await parseJSONResponse<{
|
||||
code: number;
|
||||
data: {
|
||||
content: string;
|
||||
description?: string;
|
||||
siteName?: string;
|
||||
title?: string;
|
||||
};
|
||||
}>(res, 'Jina');
|
||||
|
||||
if (json.code !== 200) {
|
||||
return;
|
||||
}
|
||||
|
||||
const result = json.data;
|
||||
if (!result?.content || result.content.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: result.content,
|
||||
contentType: 'text',
|
||||
description: result?.description,
|
||||
length: result.content.length,
|
||||
siteName: result?.siteName,
|
||||
title: result?.title,
|
||||
url,
|
||||
};
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
import { ssrfSafeFetch } from '@lobechat/ssrf-safe-fetch';
|
||||
|
||||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
|
||||
import { createHTTPStatusError } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
const mixinHeaders = {
|
||||
|
|
@ -39,28 +40,25 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
|
|||
|
||||
try {
|
||||
res = await withTimeout(
|
||||
ssrfSafeFetch(url, {
|
||||
headers: mixinHeaders,
|
||||
signal: new AbortController().signal,
|
||||
}),
|
||||
(signal) =>
|
||||
ssrfSafeFetch(url, {
|
||||
headers: mixinHeaders,
|
||||
signal,
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
if (error.message === 'fetch failed') {
|
||||
throw new NetworkConnectionError();
|
||||
}
|
||||
|
||||
if (error instanceof TimeoutError) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
throw e;
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
if (res.status === 404) {
|
||||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
throw await createHTTPStatusError(res, 'Naive');
|
||||
}
|
||||
|
||||
const type = res.headers.get('content-type');
|
||||
|
||||
if (type?.includes('application/json')) {
|
||||
|
|
@ -74,7 +72,7 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
|
|||
}
|
||||
|
||||
return {
|
||||
content: content,
|
||||
content,
|
||||
contentType: 'json',
|
||||
length: content.length,
|
||||
url,
|
||||
|
|
@ -91,8 +89,8 @@ export const naive: CrawlImpl = async (url, { filterOptions }) => {
|
|||
return;
|
||||
}
|
||||
|
||||
// it's blocked by cloudflare
|
||||
if (result.title !== 'Just a moment...') {
|
||||
// It's blocked by Cloudflare.
|
||||
if (result.title === 'Just a moment...') {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
interface Search1ApiResponse {
|
||||
|
|
@ -21,29 +22,22 @@ export const search1api: CrawlImpl = async (url) => {
|
|||
|
||||
try {
|
||||
res = await withTimeout(
|
||||
fetch('https://api.search1api.com/crawl', {
|
||||
body: JSON.stringify({
|
||||
url,
|
||||
(signal) =>
|
||||
fetch('https://api.search1api.com/crawl', {
|
||||
body: JSON.stringify({
|
||||
url,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
signal,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
if (error.message === 'fetch failed') {
|
||||
throw new NetworkConnectionError();
|
||||
}
|
||||
|
||||
if (error instanceof TimeoutError) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
throw e;
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
|
|
@ -51,30 +45,24 @@ export const search1api: CrawlImpl = async (url) => {
|
|||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
|
||||
throw new Error(`Search1API request failed with status ${res.status}: ${res.statusText}`);
|
||||
throw await createHTTPStatusError(res, 'Search1API');
|
||||
}
|
||||
|
||||
try {
|
||||
const data = (await res.json()) as Search1ApiResponse;
|
||||
const data = await parseJSONResponse<Search1ApiResponse>(res, 'Search1API');
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!data.results.content || data.results.content.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: data.results.content,
|
||||
contentType: 'text',
|
||||
description: data.results.title,
|
||||
// Using title as description since API doesn't provide a separate description
|
||||
length: data.results.content.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: data.results.title,
|
||||
url: data.results.link || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
// Check if content is empty or too short
|
||||
if (!data.results?.content || data.results.content.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
return {
|
||||
content: data.results.content,
|
||||
contentType: 'text',
|
||||
description: data.results?.title,
|
||||
// Using title as description since API doesn't provide a separate description
|
||||
length: data.results.content.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: data.results?.title,
|
||||
url: data.results?.link || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import type { CrawlImpl, CrawlSuccessResult } from '../type';
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../utils/errorType';
|
||||
import { PageNotFoundError, toFetchError } from '../utils/errorType';
|
||||
import { createHTTPStatusError, parseJSONResponse } from '../utils/response';
|
||||
import { DEFAULT_TIMEOUT, withTimeout } from '../utils/withTimeout';
|
||||
|
||||
interface TavilyResults {
|
||||
|
|
@ -28,31 +29,24 @@ export const tavily: CrawlImpl = async (url) => {
|
|||
|
||||
try {
|
||||
res = await withTimeout(
|
||||
fetch('https://api.tavily.com/extract', {
|
||||
body: JSON.stringify({
|
||||
extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced
|
||||
include_images: false,
|
||||
urls: url,
|
||||
(signal) =>
|
||||
fetch('https://api.tavily.com/extract', {
|
||||
body: JSON.stringify({
|
||||
extract_depth: process.env.TAVILY_EXTRACT_DEPTH || 'basic', // basic or advanced
|
||||
include_images: false,
|
||||
urls: url,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
signal,
|
||||
}),
|
||||
headers: {
|
||||
'Authorization': !apiKey ? '' : `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
method: 'POST',
|
||||
}),
|
||||
DEFAULT_TIMEOUT,
|
||||
);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
if (error.message === 'fetch failed') {
|
||||
throw new NetworkConnectionError();
|
||||
}
|
||||
|
||||
if (error instanceof TimeoutError) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
throw e;
|
||||
throw toFetchError(e);
|
||||
}
|
||||
|
||||
if (!res.ok) {
|
||||
|
|
@ -60,35 +54,29 @@ export const tavily: CrawlImpl = async (url) => {
|
|||
throw new PageNotFoundError(res.statusText);
|
||||
}
|
||||
|
||||
throw new Error(`Tavily request failed with status ${res.status}: ${res.statusText}`);
|
||||
throw await createHTTPStatusError(res, 'Tavily');
|
||||
}
|
||||
|
||||
try {
|
||||
const data = (await res.json()) as TavilyResponse;
|
||||
const data = await parseJSONResponse<TavilyResponse>(res, 'Tavily');
|
||||
|
||||
if (!data.results || data.results.length === 0) {
|
||||
console.warn('Tavily API returned no results for URL:', url);
|
||||
return;
|
||||
}
|
||||
|
||||
const firstResult = data.results[0];
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!firstResult.raw_content || firstResult.raw_content.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: firstResult.raw_content,
|
||||
contentType: 'text',
|
||||
length: firstResult.raw_content.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: new URL(url).hostname,
|
||||
url: firstResult.url || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
if (!data.results || data.results.length === 0) {
|
||||
console.warn('Tavily API returned no results for URL:', url);
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
const firstResult = data.results[0];
|
||||
|
||||
// Check if content is empty or too short
|
||||
if (!firstResult.raw_content || firstResult.raw_content.length < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
return {
|
||||
content: firstResult.raw_content,
|
||||
contentType: 'text',
|
||||
length: firstResult.raw_content.length,
|
||||
siteName: new URL(url).hostname,
|
||||
title: new URL(url).hostname,
|
||||
url: firstResult.url || url,
|
||||
} satisfies CrawlSuccessResult;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -59,13 +59,18 @@ export class Crawler {
|
|||
try {
|
||||
const res = await crawlImpls[impl](transformedUrl, { filterOptions: mergedFilterOptions });
|
||||
|
||||
if (res && res.content && res.content?.length > 100)
|
||||
if (res && res.content && res.content.length > 100) {
|
||||
return {
|
||||
crawler: impl,
|
||||
data: res,
|
||||
originalUrl: url,
|
||||
transformedUrl: transformedUrl !== url ? transformedUrl : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
finalError = new Error(`${impl} returned empty or short content`);
|
||||
finalError.name = 'EmptyCrawlResultError';
|
||||
finalCrawler = impl;
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
finalError = error as Error;
|
||||
|
|
@ -77,10 +82,10 @@ export class Crawler {
|
|||
const errorMessage = finalError?.message;
|
||||
|
||||
return {
|
||||
crawler: finalCrawler!,
|
||||
crawler: finalCrawler || finalImpls.at(-1) || 'unknown',
|
||||
data: {
|
||||
content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
|
||||
errorMessage: errorMessage,
|
||||
errorMessage,
|
||||
errorType,
|
||||
},
|
||||
originalUrl: url,
|
||||
|
|
|
|||
25
packages/web-crawler/src/test-utils.ts
Normal file
25
packages/web-crawler/src/test-utils.ts
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import { vi } from 'vitest';
|
||||
|
||||
/**
 * Build a minimal fake `Response` object for crawler tests.
 *
 * `json`, `text`, and `clone` are all `vi.fn()` mocks, so a test can re-stub any of them.
 * When `status` / `statusText` are omitted they follow `opts.ok`:
 * 200 / 'OK' for ok responses, 500 / 'Internal Server Error' otherwise.
 * `clone()` returns a copy of the mock whose `json` and `text` are separate mocks
 * resolving to the same body.
 */
export const createMockResponse = (
  body: any,
  opts: { ok: boolean; status?: number; statusText?: string } = { ok: true },
) => {
  const { ok } = opts;
  // String bodies are served verbatim by `text()`; anything else is JSON-encoded.
  const rawText = typeof body === 'string' ? body : JSON.stringify(body);

  const mockResponse: any = {
    ok,
    status: opts.status ?? (ok ? 200 : 500),
    statusText: opts.statusText ?? (ok ? 'OK' : 'Internal Server Error'),
    json: vi.fn().mockResolvedValue(body),
    text: vi.fn().mockResolvedValue(rawText),
    clone: vi.fn(),
  };

  // The clone carries over every field (including the `clone` mock itself) but gets
  // its own `json` / `text` mocks.
  const clonedResponse = {
    ...mockResponse,
    json: vi.fn().mockResolvedValue(body),
    text: vi.fn().mockResolvedValue(rawText),
  };
  mockResponse.clone.mockReturnValue(clonedResponse);

  return mockResponse;
};
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { applyUrlRules } from './appUrlRules';
|
||||
import { applyUrlRules } from '../appUrlRules';
|
||||
|
||||
describe('applyUrlRules', () => {
|
||||
// @gru-agent github file rules 不要改
|
||||
|
|
@ -1,6 +1,12 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { NetworkConnectionError, PageNotFoundError, TimeoutError } from '../errorType';
|
||||
import {
|
||||
isFetchNetworkError,
|
||||
NetworkConnectionError,
|
||||
PageNotFoundError,
|
||||
TimeoutError,
|
||||
toFetchError,
|
||||
} from '../errorType';
|
||||
|
||||
describe('errorType', () => {
|
||||
describe('PageNotFoundError', () => {
|
||||
|
|
@ -170,6 +176,43 @@ describe('errorType', () => {
|
|||
});
|
||||
});
|
||||
|
||||
describe('isFetchNetworkError', () => {
|
||||
it('should return true for TypeError with "fetch failed" message', () => {
|
||||
expect(isFetchNetworkError(new TypeError('fetch failed'))).toBe(true);
|
||||
});
|
||||
|
||||
it('should return false for plain Error with "fetch failed" message', () => {
|
||||
expect(isFetchNetworkError(new Error('fetch failed'))).toBe(false);
|
||||
});
|
||||
|
||||
it('should return false for TypeError with different message', () => {
|
||||
expect(isFetchNetworkError(new TypeError('something else'))).toBe(false);
|
||||
});
|
||||
|
||||
it('should return false for non-error values', () => {
|
||||
expect(isFetchNetworkError('fetch failed')).toBe(false);
|
||||
expect(isFetchNetworkError(null)).toBe(false);
|
||||
expect(isFetchNetworkError(undefined)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('toFetchError', () => {
|
||||
it('should return NetworkConnectionError for fetch network errors', () => {
|
||||
const result = toFetchError(new TypeError('fetch failed'));
|
||||
expect(result).toBeInstanceOf(NetworkConnectionError);
|
||||
});
|
||||
|
||||
it('should return TimeoutError as-is', () => {
|
||||
const timeout = new TimeoutError('Request timeout after 10000ms');
|
||||
expect(toFetchError(timeout)).toBe(timeout);
|
||||
});
|
||||
|
||||
it('should return unknown errors unchanged', () => {
|
||||
const unknown = new Error('something unexpected');
|
||||
expect(toFetchError(unknown)).toBe(unknown);
|
||||
});
|
||||
});
|
||||
|
||||
describe('error catching scenarios', () => {
|
||||
it('should allow catching specific error types', () => {
|
||||
const testErrors = [
|
||||
|
|
|
|||
102
packages/web-crawler/src/utils/__tests__/response.test.ts
Normal file
102
packages/web-crawler/src/utils/__tests__/response.test.ts
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { createHTTPStatusError, parseJSONResponse, ResponseBodyParseError } from '../response';
|
||||
|
||||
/**
 * Create a real `Response` carrying `body` for the tests in this file.
 *
 * `status` and `statusText` are forwarded to the `Response` constructor
 * (defaults: 200 / 'OK'). `ok` only selects the `Content-Type` header:
 * 'application/json' when true (the default), 'text/html' otherwise.
 */
const createMockResponse = (
  body: string,
  options: { ok?: boolean; status?: number; statusText?: string } = {},
) => {
  const { ok: isOk = true, status: code = 200, statusText: reason = 'OK' } = options;
  const contentType = isOk ? 'application/json' : 'text/html';

  return new Response(body, {
    headers: { 'Content-Type': contentType },
    status: code,
    statusText: reason,
  });
};
|
||||
|
||||
describe('ResponseBodyParseError', () => {
|
||||
it('should create error with provider and body snippet', () => {
|
||||
const error = new ResponseBodyParseError('Jina', '<html>error</html>');
|
||||
expect(error.message).toBe('Jina returned non-JSON response: <html>error</html>');
|
||||
expect(error.name).toBe('ResponseBodyParseError');
|
||||
});
|
||||
|
||||
it('should create error without body snippet', () => {
|
||||
const error = new ResponseBodyParseError('Firecrawl');
|
||||
expect(error.message).toBe('Firecrawl returned non-JSON response');
|
||||
});
|
||||
});
|
||||
|
||||
describe('parseJSONResponse', () => {
|
||||
it('should parse valid JSON response', async () => {
|
||||
const data = { code: 200, results: ['a', 'b'] };
|
||||
const response = createMockResponse(JSON.stringify(data));
|
||||
|
||||
const result = await parseJSONResponse<typeof data>(response, 'TestProvider');
|
||||
|
||||
expect(result).toEqual(data);
|
||||
});
|
||||
|
||||
it('should throw ResponseBodyParseError for non-JSON response', async () => {
|
||||
const response = createMockResponse('<html><body>Error</body></html>');
|
||||
|
||||
await expect(parseJSONResponse(response, 'Jina')).rejects.toThrow(ResponseBodyParseError);
|
||||
await expect(
|
||||
parseJSONResponse(createMockResponse('<html><body>Error</body></html>'), 'Jina'),
|
||||
).rejects.toThrow('Jina returned non-JSON response');
|
||||
});
|
||||
|
||||
it('should include body snippet in error for non-JSON response', async () => {
|
||||
const htmlBody = '<html><body>Internal Server Error</body></html>';
|
||||
const response = createMockResponse(htmlBody);
|
||||
|
||||
await expect(parseJSONResponse(response, 'Firecrawl')).rejects.toThrow(
|
||||
/Firecrawl returned non-JSON response: .*Internal Server Error/,
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle empty response body', async () => {
|
||||
const response = createMockResponse('');
|
||||
|
||||
await expect(parseJSONResponse(response, 'TestProvider')).rejects.toThrow(
|
||||
'TestProvider returned non-JSON response',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('createHTTPStatusError', () => {
|
||||
it('should create error with status and body snippet', async () => {
|
||||
const response = createMockResponse('Not Found', {
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
});
|
||||
|
||||
const error = await createHTTPStatusError(response, 'Exa');
|
||||
|
||||
expect(error.message).toContain('Exa request failed with status 404: Not Found');
|
||||
expect(error.message).toContain('Not Found');
|
||||
});
|
||||
|
||||
it('should create error without body when response text fails', async () => {
|
||||
const response = createMockResponse('', {
|
||||
ok: false,
|
||||
status: 500,
|
||||
statusText: 'Internal Server Error',
|
||||
});
|
||||
|
||||
const error = await createHTTPStatusError(response, 'Tavily');
|
||||
|
||||
expect(error.message).toBe('Tavily request failed with status 500: Internal Server Error');
|
||||
});
|
||||
|
||||
it('should truncate long body snippets', async () => {
|
||||
const longBody = 'x'.repeat(500);
|
||||
const response = createMockResponse(longBody, { ok: false, status: 500, statusText: 'Error' });
|
||||
|
||||
const error = await createHTTPStatusError(response, 'Test');
|
||||
|
||||
// Body snippet should be truncated to 200 chars
|
||||
expect(error.message.length).toBeLessThan(500 + 100);
|
||||
});
|
||||
});
|
||||
|
|
@ -12,18 +12,18 @@ describe('withTimeout', () => {
|
|||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it('should resolve when promise resolves before timeout', async () => {
|
||||
const promise = Promise.resolve('success');
|
||||
const result = await withTimeout(promise, 1000);
|
||||
it('should resolve when factory function resolves before timeout', async () => {
|
||||
const result = await withTimeout(() => Promise.resolve('success'), 1000);
|
||||
expect(result).toBe('success');
|
||||
});
|
||||
|
||||
it('should reject with TimeoutError when promise takes too long', async () => {
|
||||
const slowPromise = new Promise((resolve) => {
|
||||
setTimeout(() => resolve('too late'), 200);
|
||||
});
|
||||
it('should reject with TimeoutError when factory takes too long', async () => {
|
||||
const fn = () =>
|
||||
new Promise((resolve) => {
|
||||
setTimeout(() => resolve('too late'), 200);
|
||||
});
|
||||
|
||||
const timeoutPromise = withTimeout(slowPromise, 100);
|
||||
const timeoutPromise = withTimeout(fn, 100);
|
||||
vi.advanceTimersByTime(100);
|
||||
|
||||
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
|
||||
|
|
@ -31,32 +31,70 @@ describe('withTimeout', () => {
|
|||
});
|
||||
|
||||
it('should use DEFAULT_TIMEOUT when no timeout specified', async () => {
|
||||
const slowPromise = new Promise((resolve) => {
|
||||
setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100);
|
||||
});
|
||||
const fn = () =>
|
||||
new Promise((resolve) => {
|
||||
setTimeout(() => resolve('success'), DEFAULT_TIMEOUT + 100);
|
||||
});
|
||||
|
||||
const timeoutPromise = withTimeout(slowPromise);
|
||||
const timeoutPromise = withTimeout(fn);
|
||||
vi.advanceTimersByTime(DEFAULT_TIMEOUT);
|
||||
|
||||
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
|
||||
await expect(timeoutPromise).rejects.toThrow(`Request timeout after ${DEFAULT_TIMEOUT}ms`);
|
||||
});
|
||||
|
||||
it('should reject with original error if promise rejects before timeout', async () => {
|
||||
it('should reject with original error if factory rejects before timeout', async () => {
|
||||
const error = new Error('Original error');
|
||||
const failingPromise = Promise.reject(error);
|
||||
const fn = () => Promise.reject(error);
|
||||
|
||||
await expect(withTimeout(failingPromise, 1000)).rejects.toThrow('Original error');
|
||||
await expect(withTimeout(fn, 1000)).rejects.toThrow('Original error');
|
||||
});
|
||||
|
||||
it('should abort controller when timeout occurs', async () => {
|
||||
const slowPromise = new Promise((resolve) => {
|
||||
setTimeout(() => resolve('too late'), 2000);
|
||||
});
|
||||
it('should pass AbortSignal to the factory function', async () => {
|
||||
const factoryFn = vi.fn().mockResolvedValue('result');
|
||||
await withTimeout(factoryFn, 1000);
|
||||
|
||||
const timeoutPromise = withTimeout(slowPromise, 1000);
|
||||
vi.advanceTimersByTime(1000);
|
||||
expect(factoryFn).toHaveBeenCalledTimes(1);
|
||||
const signal = factoryFn.mock.calls[0][0];
|
||||
expect(signal).toBeInstanceOf(AbortSignal);
|
||||
expect(signal.aborted).toBe(false);
|
||||
});
|
||||
|
||||
it('should abort the signal when timeout occurs', async () => {
|
||||
let capturedSignal: AbortSignal | undefined;
|
||||
const fn = (signal: AbortSignal) => {
|
||||
capturedSignal = signal;
|
||||
return new Promise((resolve) => {
|
||||
setTimeout(() => resolve('too late'), 2000);
|
||||
});
|
||||
};
|
||||
|
||||
const timeoutPromise = withTimeout(fn, 100);
|
||||
expect(capturedSignal!.aborted).toBe(false);
|
||||
|
||||
vi.advanceTimersByTime(100);
|
||||
await expect(timeoutPromise).rejects.toThrow(TimeoutError);
|
||||
|
||||
expect(capturedSignal!.aborted).toBe(true);
|
||||
});
|
||||
|
||||
it('should clear timeout timer when promise resolves successfully', async () => {
|
||||
const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout');
|
||||
|
||||
await withTimeout(() => Promise.resolve('success'), 5000);
|
||||
|
||||
expect(clearTimeoutSpy).toHaveBeenCalled();
|
||||
clearTimeoutSpy.mockRestore();
|
||||
});
|
||||
|
||||
it('should clear timeout timer when promise rejects', async () => {
|
||||
const clearTimeoutSpy = vi.spyOn(globalThis, 'clearTimeout');
|
||||
|
||||
await expect(withTimeout(() => Promise.reject(new Error('fail')), 5000)).rejects.toThrow(
|
||||
'fail',
|
||||
);
|
||||
|
||||
expect(clearTimeoutSpy).toHaveBeenCalled();
|
||||
clearTimeoutSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -17,3 +17,34 @@ export class TimeoutError extends Error {
|
|||
this.name = 'TimeoutError';
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check if an error is a Node.js fetch network failure.
 * Node.js undici throws TypeError with message "fetch failed" on network errors.
 *
 * @param error Value caught from a failed request; may be anything.
 * @returns `true` only when `error` is a `TypeError` whose message is exactly "fetch failed".
 */
export const isFetchNetworkError = (error: unknown): boolean =>
  // `instanceof` already narrows `error` to TypeError, so no type assertion is needed.
  error instanceof TypeError && error.message === 'fetch failed';
|
||||
|
||||
/**
 * Normalize a fetch error into a typed error for consistent handling.
 * Converts network failures to `NetworkConnectionError`, passes through `TimeoutError`
 * and any other `Error` instance unchanged, and wraps non-Error throwables
 * (strings, `undefined`, ...) in a plain `Error` so the result is always a real `Error`.
 * Callers should `throw` the returned value.
 *
 * @param error Value caught from a failed request.
 * @returns An `Error` instance suitable for rethrowing.
 *
 * @example
 * ```ts
 * } catch (e) {
 *   throw toFetchError(e);
 * }
 * ```
 */
export const toFetchError = (error: unknown): Error => {
  if (isFetchNetworkError(error)) {
    return new NetworkConnectionError();
  }

  // Covers `TimeoutError` as well as every other Error subclass: returned untouched.
  if (error instanceof Error) {
    return error;
  }

  // A non-Error value was thrown; keep its text instead of pretending it is an Error.
  return new Error(String(error));
};
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { readFileSync } from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
import path from 'node:path';
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
|
|
@ -33,4 +33,29 @@ describe('htmlToMarkdown', () => {
|
|||
expect(data).toMatchSnapshot();
|
||||
}, 20000);
|
||||
});
|
||||
|
||||
it('should truncate HTML exceeding 1 MB', () => {
|
||||
// Create HTML slightly over 1 MB
|
||||
const maxSize = 1024 * 1024;
|
||||
const largeContent = 'x'.repeat(maxSize + 1000);
|
||||
const html = `<html><body><p>${largeContent}</p></body></html>`;
|
||||
|
||||
// Should not throw - the function handles large HTML by truncating
|
||||
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
|
||||
|
||||
// Verify content was produced (truncated HTML is still parseable)
|
||||
expect(result).toBeDefined();
|
||||
expect(result.content).toBeDefined();
|
||||
// The output content should be smaller than the input due to truncation
|
||||
expect(result.content.length).toBeLessThan(html.length);
|
||||
}, 20000);
|
||||
|
||||
it('should not truncate HTML under 1 MB', () => {
|
||||
const html = '<html><body><p>Small content</p></body></html>';
|
||||
|
||||
const result = htmlToMarkdown(html, { url: 'https://example.com', filterOptions: {} });
|
||||
|
||||
expect(result).toBeDefined();
|
||||
expect(result.content).toContain('Small content');
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -5,6 +5,9 @@ import { NodeHtmlMarkdown } from 'node-html-markdown';
|
|||
|
||||
import type { FilterOptions } from '../type';
|
||||
|
||||
/** Truncate HTML to 1 MB before DOM parsing to prevent CPU spikes on large pages */
|
||||
const MAX_HTML_SIZE = 1024 * 1024;
|
||||
|
||||
const cleanObj = <T extends object>(
|
||||
obj: T,
|
||||
): {
|
||||
|
|
@ -24,9 +27,10 @@ interface HtmlToMarkdownOutput {
|
|||
}
|
||||
|
||||
export const htmlToMarkdown = (
|
||||
html: string,
|
||||
rawHtml: string,
|
||||
{ url, filterOptions }: { filterOptions: FilterOptions; url: string },
|
||||
): HtmlToMarkdownOutput => {
|
||||
const html = rawHtml.length > MAX_HTML_SIZE ? rawHtml.slice(0, MAX_HTML_SIZE) : rawHtml;
|
||||
const window = new Window({ url });
|
||||
|
||||
const document = window.document;
|
||||
|
|
|
|||
49
packages/web-crawler/src/utils/response.ts
Normal file
49
packages/web-crawler/src/utils/response.ts
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
/** Maximum number of characters of a response body quoted inside an error message. */
const ERROR_BODY_SNIPPET_LIMIT = 200;

/** Collapse every run of whitespace into a single space and trim both ends. */
const normalizeBodySnippet = (body: string): string => body.replace(/\s+/g, ' ').trim();

/**
 * Thrown when a provider response body cannot be parsed as JSON.
 * The message embeds a short excerpt of the body when one is available.
 */
export class ResponseBodyParseError extends Error {
  /** Name of the provider whose response failed to parse. */
  readonly provider: string;
  /** Normalized excerpt of the offending body, if one could be read. */
  readonly bodySnippet?: string;

  /**
   * @param provider Provider name used as the message prefix.
   * @param bodySnippet Optional excerpt of the unparsable body.
   */
  constructor(provider: string, bodySnippet?: string) {
    super(
      bodySnippet
        ? `${provider} returned non-JSON response: ${bodySnippet}`
        : `${provider} returned non-JSON response`,
    );
    this.name = 'ResponseBodyParseError';
    this.provider = provider;
    this.bodySnippet = bodySnippet;
  }
}

/**
 * Read the response body as text and reduce it to a short, single-line excerpt.
 *
 * @param response Response whose body has not been consumed yet.
 * @returns The normalized excerpt (at most `ERROR_BODY_SNIPPET_LIMIT` characters),
 * or `undefined` when the body is empty or cannot be read.
 */
const getBodySnippet = async (response: Response): Promise<string | undefined> => {
  try {
    const body = await response.text();
    const snippet = normalizeBodySnippet(body).slice(0, ERROR_BODY_SNIPPET_LIMIT);

    return snippet.length > 0 ? snippet : undefined;
  } catch {
    // Best effort only: a failure to read the body must not mask the original problem.
    return undefined;
  }
};

/**
 * Parse a response body as JSON, raising a descriptive error when it is not JSON.
 *
 * @param response Response to parse; its body is consumed.
 * @param provider Provider name used in the error message.
 * @returns The parsed body, typed as `T` (not validated at runtime).
 * @throws ResponseBodyParseError when `response.json()` fails.
 */
export const parseJSONResponse = async <T>(response: Response, provider: string): Promise<T> => {
  // A body can be read only once, so keep a clone for building the error excerpt.
  const clonedResponse = response.clone();

  try {
    return (await response.json()) as T;
  } catch {
    const bodySnippet = await getBodySnippet(clonedResponse);
    throw new ResponseBodyParseError(provider, bodySnippet);
  }
};

/**
 * Build an `Error` describing a failed HTTP response. The error is returned, not thrown.
 *
 * @param response Failed response; its body is consumed to build the excerpt.
 * @param provider Provider name used as the message prefix.
 * @returns An `Error` whose message contains the status code, the status text when
 * the response has one, and a body excerpt when the body is readable and non-empty.
 */
export const createHTTPStatusError = async (
  response: Response,
  provider: string,
): Promise<Error> => {
  const bodySnippet = await getBodySnippet(response);

  // `statusText` may be empty; skip it rather than emit a dangling "status 404: ".
  const statusLine = response.statusText
    ? `${response.status}: ${response.statusText}`
    : `${response.status}`;
  const summary = `${provider} request failed with status ${statusLine}`;

  return new Error(bodySnippet ? `${summary}. Response: ${bodySnippet}` : summary);
};
|
||||
|
|
@ -3,19 +3,28 @@ import { TimeoutError } from './errorType';
|
|||
/** Timeout in milliseconds applied by `withTimeout` when the caller passes none. */
export const DEFAULT_TIMEOUT = 10_000;

/**
 * Wraps a factory function with a timeout and abort support.
 * The factory receives an AbortSignal that is aborted on timeout,
 * allowing the underlying request (e.g. fetch) to be properly cancelled.
 *
 * The factory is invoked synchronously. The timer is cleared as soon as the factory's
 * promise settles, and also when the factory throws before returning a promise.
 *
 * @param fn Factory function that receives an AbortSignal and returns a Promise
 * @param ms Timeout in milliseconds
 * @returns Promise that settles like the one returned by `fn`, or rejects with
 * `TimeoutError` if `fn` has not settled within `ms`
 */
export const withTimeout = <T>(
  fn: (signal: AbortSignal) => Promise<T>,
  ms: number = DEFAULT_TIMEOUT,
): Promise<T> => {
  const controller = new AbortController();
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  const timeoutPromise = new Promise<T>((_, reject) => {
    timeoutId = setTimeout(() => {
      controller.abort();
      reject(new TimeoutError(`Request timeout after ${ms}ms`));
    }, ms);
  });

  let task: Promise<T>;
  try {
    task = fn(controller.signal);
  } catch (error) {
    // A synchronous throw would otherwise leave the timer armed and later reject
    // `timeoutPromise` with nobody listening (an unhandled rejection).
    clearTimeout(timeoutId);
    return Promise.reject(error);
  }

  return Promise.race([task.finally(() => clearTimeout(timeoutId)), timeoutPromise]);
};
|
||||
|
|
|
|||
|
|
@ -1,15 +1,25 @@
|
|||
import { createEnv } from '@t3-oss/env-nextjs';
|
||||
import { z } from 'zod';
|
||||
|
||||
/**
 * Build a zod schema for an optional integer environment variable.
 *
 * `null` and empty or whitespace-only strings are mapped to `undefined` before
 * coercion so they count as "not set"; `z.coerce.number()` would otherwise turn
 * them into `0`.
 *
 * @param min Smallest accepted value (inclusive).
 * @param max Largest accepted value (inclusive).
 * @returns Schema yielding an integer within `[min, max]`, or `undefined` when unset.
 */
const optionalNumberEnv = (min: number, max: number) =>
  z.preprocess(
    (value) => {
      if (value === null) return undefined;
      if (typeof value === 'string' && value.trim() === '') return undefined;

      return value;
    },
    z.coerce.number().int().max(max).min(min).optional(),
  );
|
||||
|
||||
export const getToolsConfig = () => {
|
||||
return createEnv({
|
||||
runtimeEnv: {
|
||||
CRAWL_CONCURRENCY: process.env.CRAWL_CONCURRENCY,
|
||||
CRAWLER_RETRY: process.env.CRAWLER_RETRY,
|
||||
CRAWLER_IMPLS: process.env.CRAWLER_IMPLS,
|
||||
SEARCH_PROVIDERS: process.env.SEARCH_PROVIDERS,
|
||||
SEARXNG_URL: process.env.SEARXNG_URL,
|
||||
},
|
||||
|
||||
server: {
|
||||
CRAWL_CONCURRENCY: optionalNumberEnv(1, 10),
|
||||
CRAWLER_RETRY: optionalNumberEnv(0, 3),
|
||||
CRAWLER_IMPLS: z.string().optional(),
|
||||
SEARCH_PROVIDERS: z.string().optional(),
|
||||
SEARXNG_URL: z.string().url().optional(),
|
||||
|
|
|
|||
|
|
@ -46,6 +46,27 @@ describe('searchRouter', () => {
|
|||
expect(result.results[1]).toEqual({ content: 'test content' });
|
||||
});
|
||||
|
||||
it('should accept all supported crawler implementations', async () => {
|
||||
const caller = searchRouter.createCaller(mockContext as any);
|
||||
|
||||
const allImpls = [
|
||||
'browserless',
|
||||
'exa',
|
||||
'firecrawl',
|
||||
'jina',
|
||||
'naive',
|
||||
'search1api',
|
||||
'tavily',
|
||||
] as const;
|
||||
for (const impl of allImpls) {
|
||||
const result = await caller.crawlPages({
|
||||
urls: ['http://test.com'],
|
||||
impls: [impl],
|
||||
});
|
||||
expect(result.results).toHaveLength(1);
|
||||
}
|
||||
});
|
||||
|
||||
it('should work without specifying impls', async () => {
|
||||
const caller = searchRouter.createCaller(mockContext as any);
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,10 @@ export const searchRouter = router({
|
|||
crawlPages: searchProcedure
|
||||
.input(
|
||||
z.object({
|
||||
impls: z.enum(['jina', 'naive', 'browserless']).array().optional(),
|
||||
impls: z
|
||||
.enum(['browserless', 'exa', 'firecrawl', 'jina', 'naive', 'search1api', 'tavily'])
|
||||
.array()
|
||||
.optional(),
|
||||
urls: z.string().array(),
|
||||
}),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
|
|||
|
||||
import { toolsEnv } from '@/envs/tools';
|
||||
|
||||
import { createSearchServiceImpl,SearchImplType } from './impls';
|
||||
import { createSearchServiceImpl, SearchImplType } from './impls';
|
||||
import { SearchService } from './index';
|
||||
|
||||
// Mock dependencies
|
||||
|
|
@ -11,7 +11,9 @@ vi.mock('@lobechat/web-crawler');
|
|||
vi.mock('./impls');
|
||||
vi.mock('@/envs/tools', () => ({
|
||||
toolsEnv: {
|
||||
CRAWL_CONCURRENCY: undefined,
|
||||
CRAWLER_IMPLS: '',
|
||||
CRAWLER_RETRY: undefined,
|
||||
SEARCH_PROVIDERS: '',
|
||||
},
|
||||
}));
|
||||
|
|
@ -279,10 +281,9 @@ describe('SearchService', () => {
|
|||
describe('crawlPages', () => {
|
||||
it('should crawl multiple pages concurrently', async () => {
|
||||
const mockCrawlResult = {
|
||||
content: 'Page content',
|
||||
description: 'Page description',
|
||||
title: 'Page title',
|
||||
url: 'https://example.com',
|
||||
crawler: 'naive',
|
||||
data: { content: 'Page content', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
|
||||
const mockCrawler = {
|
||||
|
|
@ -304,8 +305,13 @@ describe('SearchService', () => {
|
|||
it('should use crawler implementations from env', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_IMPLS = 'jina,reader';
|
||||
|
||||
const mockSuccessResult = {
|
||||
crawler: 'jina',
|
||||
data: { content: 'ok', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue({}),
|
||||
crawl: vi.fn().mockResolvedValue(mockSuccessResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
|
|
@ -317,8 +323,13 @@ describe('SearchService', () => {
|
|||
});
|
||||
|
||||
it('should pass impls parameter to crawler.crawl', async () => {
|
||||
const mockSuccessResult = {
|
||||
crawler: 'jina',
|
||||
data: { content: 'ok', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue({}),
|
||||
crawl: vi.fn().mockResolvedValue(mockSuccessResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
|
|
@ -334,5 +345,133 @@ describe('SearchService', () => {
|
|||
url: 'https://example.com',
|
||||
});
|
||||
});
|
||||
|
||||
it('should use CRAWL_CONCURRENCY from env', async () => {
|
||||
vi.mocked(toolsEnv).CRAWL_CONCURRENCY = 1;
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue({
|
||||
crawler: 'naive',
|
||||
data: { content: 'ok', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
}),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const urls = ['https://a.com', 'https://b.com'];
|
||||
await searchService.crawlPages({ urls });
|
||||
|
||||
// All URLs should still be crawled
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it('should retry on failed crawl results', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
|
||||
|
||||
const failedResult = {
|
||||
crawler: 'naive',
|
||||
data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
const successResult = {
|
||||
crawler: 'naive',
|
||||
data: { content: 'Page content', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValueOnce(failedResult).mockResolvedValueOnce(successResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
|
||||
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
|
||||
expect(result.results[0]).toBe(successResult);
|
||||
});
|
||||
|
||||
it('should return last failed result after all retries exhausted', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
|
||||
|
||||
const failedResult = {
|
||||
crawler: 'naive',
|
||||
data: { content: 'Fail', errorType: 'NetworkError', errorMessage: 'timeout' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue(failedResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
|
||||
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2); // 1 + 1 retry
|
||||
expect(result.results[0]).toBe(failedResult);
|
||||
});
|
||||
|
||||
it('should not retry when CRAWLER_RETRY is 0', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_RETRY = 0;
|
||||
|
||||
const failedResult = {
|
||||
crawler: 'naive',
|
||||
data: { content: 'Fail', errorType: 'Error', errorMessage: 'fail' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue(failedResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
|
||||
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(1);
|
||||
expect(result.results[0]).toBe(failedResult);
|
||||
});
|
||||
|
||||
it('should handle crawl exceptions during retry', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockRejectedValue(new Error('Network error')),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
|
||||
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(2);
|
||||
expect(result.results[0].data).toMatchObject({
|
||||
errorType: 'Error',
|
||||
errorMessage: 'Network error',
|
||||
});
|
||||
});
|
||||
|
||||
it('should detect successful results by contentType presence', async () => {
|
||||
vi.mocked(toolsEnv).CRAWLER_RETRY = 1;
|
||||
|
||||
const successResult = {
|
||||
crawler: 'naive',
|
||||
data: { content: 'Page content', contentType: 'text' },
|
||||
originalUrl: 'https://example.com',
|
||||
};
|
||||
|
||||
const mockCrawler = {
|
||||
crawl: vi.fn().mockResolvedValue(successResult),
|
||||
};
|
||||
vi.mocked(Crawler).mockImplementation(() => mockCrawler as any);
|
||||
|
||||
searchService = new SearchService();
|
||||
const result = await searchService.crawlPages({ urls: ['https://example.com'] });
|
||||
|
||||
// Should not retry since result has contentType (successful)
|
||||
expect(mockCrawler.crawl).toHaveBeenCalledTimes(1);
|
||||
expect(result.results[0]).toBe(successResult);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { type SearchParams, type SearchQuery } from '@lobechat/types';
|
||||
import { type CrawlImplType } from '@lobechat/web-crawler';
|
||||
import type { SearchParams, SearchQuery } from '@lobechat/types';
|
||||
import type { Crawler, CrawlImplType, CrawlUniformResult } from '@lobechat/web-crawler';
|
||||
import pMap from 'p-map';
|
||||
|
||||
import { toolsEnv } from '@/envs/tools';
|
||||
|
|
@ -7,6 +7,9 @@ import { toolsEnv } from '@/envs/tools';
|
|||
import { type SearchImplType, type SearchServiceImpl } from './impls';
|
||||
import { createSearchServiceImpl } from './impls';
|
||||
|
||||
const DEFAULT_CRAWL_CONCURRENCY = 3;
|
||||
const DEFAULT_CRAWLER_RETRY = 1;
|
||||
|
||||
const parseImplEnv = (envString: string = '') => {
|
||||
// Handle full-width commas and extra whitespace
|
||||
const envValue = envString.replaceAll(',', ',').trim();
|
||||
|
|
@ -24,6 +27,14 @@ export class SearchService {
|
|||
return parseImplEnv(toolsEnv.CRAWLER_IMPLS);
|
||||
}
|
||||
|
||||
/** Crawl concurrency: `CRAWL_CONCURRENCY` from the tools env, else the default. */
private get crawlConcurrency() {
  const configured = toolsEnv.CRAWL_CONCURRENCY;

  return configured ?? DEFAULT_CRAWL_CONCURRENCY;
}
|
||||
|
||||
/** Retry count per URL: `CRAWLER_RETRY` from the tools env, else the default. */
private get crawlerRetry() {
  const configured = toolsEnv.CRAWLER_RETRY;

  return configured ?? DEFAULT_CRAWLER_RETRY;
}
|
||||
|
||||
constructor() {
|
||||
const impls = this.searchImpls;
|
||||
// TODO: need use turn mode
|
||||
|
|
@ -37,14 +48,59 @@ export class SearchService {
|
|||
const results = await pMap(
|
||||
input.urls,
|
||||
async (url) => {
|
||||
return await crawler.crawl({ impls: input.impls, url });
|
||||
return await this.crawlWithRetry(crawler, url, input.impls);
|
||||
},
|
||||
{ concurrency: 3 },
|
||||
{ concurrency: this.crawlConcurrency },
|
||||
);
|
||||
|
||||
return { results };
|
||||
}
|
||||
|
||||
/**
 * Crawl one URL, trying again while the crawler throws or reports a failed result.
 *
 * Makes `crawlerRetry + 1` attempts at most. The first successful result is returned
 * immediately. Otherwise the most recent result the crawler returned wins; only when
 * every attempt threw is a synthetic failure built from the last caught error.
 *
 * @param crawler Crawler instance used for every attempt.
 * @param url Page to crawl.
 * @param impls Optional crawler implementations forwarded to `crawler.crawl`.
 * @returns A successful result, the last failed result, or a synthetic failure.
 */
private async crawlWithRetry(
  crawler: Crawler,
  url: string,
  impls?: CrawlImplType[],
): Promise<CrawlUniformResult> {
  const attemptLimit = this.crawlerRetry + 1;
  let latestResult: CrawlUniformResult | undefined;
  let latestError: Error | undefined;

  for (let attempt = 1; attempt <= attemptLimit; attempt++) {
    try {
      latestResult = await crawler.crawl({ impls, url });

      if (!this.isFailedCrawlResult(latestResult)) return latestResult;
    } catch (error) {
      latestError = error as Error;
    }
  }

  // A failed-but-returned result takes precedence over a caught exception.
  if (latestResult) return latestResult;

  const errorType = latestError?.name || 'UnknownError';
  const errorMessage = latestError?.message;

  return {
    crawler: 'unknown',
    data: {
      content: `Fail to crawl the page. Error type: ${errorType}, error message: ${errorMessage}`,
      errorMessage,
      errorType,
    },
    originalUrl: url,
  };
}
|
||||
|
||||
/**
 * Tell whether a crawl result represents a failure.
 *
 * Successful results carry a `contentType` key (e.g. 'text', 'json') in `result.data`;
 * failed results carry `errorType`/`errorMessage` instead, so a missing `contentType`
 * key marks the result as failed.
 */
private isFailedCrawlResult(result: CrawlUniformResult): boolean {
  const hasContentType = 'contentType' in result.data;

  return !hasContentType;
}
|
||||
|
||||
private get searchImpls() {
|
||||
return parseImplEnv(toolsEnv.SEARCH_PROVIDERS) as SearchImplType[];
|
||||
}
|
||||
|
|
@ -58,17 +114,17 @@ export class SearchService {
|
|||
|
||||
async webSearch({ query, searchCategories, searchEngines, searchTimeRange }: SearchQuery) {
|
||||
let data = await this.query(query, {
|
||||
searchCategories: searchCategories,
|
||||
searchEngines: searchEngines,
|
||||
searchTimeRange: searchTimeRange,
|
||||
searchCategories,
|
||||
searchEngines,
|
||||
searchTimeRange,
|
||||
});
|
||||
|
||||
// First retry: remove search engine restrictions if no results found
|
||||
if (data.results.length === 0 && searchEngines && searchEngines?.length > 0) {
|
||||
const paramsExcludeSearchEngines = {
|
||||
searchCategories: searchCategories,
|
||||
searchCategories,
|
||||
searchEngines: undefined,
|
||||
searchTimeRange: searchTimeRange,
|
||||
searchTimeRange,
|
||||
};
|
||||
data = await this.query(query, paramsExcludeSearchEngines);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue