Crawl

Start Crawl Job

Starts a crawl job for a given URL.

Method: client.crawl.start(params: StartCrawlJobParams): Promise<StartCrawlJobResponse>

Endpoint: POST /api/crawl

Parameters:

  • StartCrawlJobParams:

    • url: string - URL to start the crawl from

    • maxPages?: number - Max number of pages to crawl

    • followLinks?: boolean - Follow links on the page

    • ignoreSitemap?: boolean - Ignore sitemap when finding links to crawl

    • excludePatterns?: string[] - Patterns for paths to exclude from crawl

    • includePatterns?: string[] - Patterns for paths to include in the crawl

    • sessionOptions?: CreateSessionParams

    • scrapeOptions?: ScrapeOptions

Response: StartCrawlJobResponse

Example:

const response = await client.crawl.start({
  url: "https://example.com",
});
console.log(response.jobId);
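The optional parameters above can be combined in a single request. A hedged sketch of a fuller payload (the option values here are illustrative examples, not defaults):

```typescript
// Illustrative StartCrawlJobParams payload; all option values are examples only.
const params = {
  url: "https://example.com",
  maxPages: 25,                 // stop after 25 pages
  followLinks: true,            // discover pages via on-page links
  ignoreSitemap: false,         // still consult the sitemap when finding links
  excludePatterns: ["/blog/*"], // skip paths matching these patterns
  includePatterns: ["/docs/*"], // only crawl paths matching these patterns
};

// const response = await client.crawl.start(params);
```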

Get Crawl Job

Retrieves details of a specific crawl job.

Method: client.crawl.get(id: string): Promise<CrawlJobResponse>

Endpoint: GET /api/crawl/{id}

Parameters:

  • id: string - Crawl job ID

Response: CrawlJobResponse

Example:

const response = await client.crawl.get(
  "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e"
);
console.log(response.status);
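Because start returns only a jobId, callers typically poll get until the job reaches a terminal status. A minimal polling sketch (the pollCrawlJob helper, its interval, and the trimmed-down interfaces are illustrative, not part of the SDK):

```typescript
// Minimal shape of the crawl API that this sketch relies on.
interface CrawlJob {
  jobId: string;
  status: "pending" | "running" | "completed" | "failed";
}

interface CrawlClient {
  get(id: string): Promise<CrawlJob>;
}

// Illustrative helper: poll until the job is "completed" or "failed".
async function pollCrawlJob(
  client: CrawlClient,
  jobId: string,
  intervalMs = 2000
): Promise<CrawlJob> {
  for (;;) {
    const job = await client.get(jobId);
    if (job.status === "completed" || job.status === "failed") {
      return job;
    }
    // Wait before polling again to avoid hammering the API.
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}
```

In practice, startAndWait (below) covers this pattern; the sketch is only useful when you need custom polling behavior.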

Start Crawl Job and Wait

Starts a crawl job and waits for it to complete.

Method: client.crawl.startAndWait(params: StartCrawlJobParams, returnAllPages: boolean = true): Promise<CrawlJobResponse>

Parameters:

  • StartCrawlJobParams:

    • url: string - URL to start the crawl from

    • maxPages?: number - Max number of pages to crawl

    • followLinks?: boolean - Follow links on the page

    • ignoreSitemap?: boolean - Ignore sitemap when finding links to crawl

    • excludePatterns?: string[] - Patterns for paths to exclude from crawl

    • includePatterns?: string[] - Patterns for paths to include in the crawl

    • sessionOptions?: CreateSessionParams

    • scrapeOptions?: ScrapeOptions

  • returnAllPages: boolean - Whether to return all crawled pages in the response (default: true)

Response: CrawlJobResponse

Example:

const response = await client.crawl.startAndWait({
  url: "https://example.com"
});
console.log(response.status);

Types

CrawlPageStatus

type CrawlPageStatus = "completed" | "failed";

CrawlJobStatus

type CrawlJobStatus = "pending" | "running" | "completed" | "failed";

StartCrawlJobResponse

interface StartCrawlJobResponse {
  jobId: string;
}

CrawledPage

interface CrawledPage {
  url: string;
  status: CrawlPageStatus;
  error?: string | null;
  metadata?: Record<string, string | string[]>;
  markdown?: string;
  html?: string;
  links?: string[];
}

CrawlJobResponse

interface CrawlJobResponse {
  jobId: string;
  status: CrawlJobStatus;
  data?: CrawledPage[];
  error?: string;
  totalCrawledPages: number;
  totalPageBatches: number;
  currentPageBatch: number;
  batchSize: number;
}
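Once a job completes, the data array of CrawledPage results can be partitioned by per-page status for reporting. A small sketch using trimmed-down versions of the types above (the partitionPages helper and sample pages are illustrative, not part of the SDK):

```typescript
type CrawlPageStatus = "completed" | "failed";

// Trimmed-down CrawledPage with only the fields this sketch uses.
interface CrawledPage {
  url: string;
  status: CrawlPageStatus;
  error?: string | null;
  markdown?: string;
}

// Split crawled pages into successes and failures.
function partitionPages(pages: CrawledPage[]): {
  ok: CrawledPage[];
  failed: CrawledPage[];
} {
  return {
    ok: pages.filter((p) => p.status === "completed"),
    failed: pages.filter((p) => p.status === "failed"),
  };
}
```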

© 2025 S2 Labs, Inc. All rights reserved.