Web scraper that scrapes web pages using LetsScrapeData XML templates.
```sh
npm install @letsscrapedata/scraper
```
Examples
1. Example with default ScraperConfig:
```javascript
// javascript
import { scraper } from "@letsscrapedata/scraper";

/**
 * tid: ID of the template to be executed, such as the template for scraping one list of examples in page "https://www.letsscrapedata.com/pages/listexample1.html"
 * parasstrs: input parameters of the tasks, such as "1"
 * This example executes five tasks using template 10001; each of them scrapes the data in one page.
 */
const newTasks = [{ tid: 10001, parasstrs: ["1", "2", "3", "4", "5"] }];

/* The following line can do the same thing using subtasks, scraping the data in the first five pages */
// const newTasks = [{ tid: 10002, parasstrs: ["5"] }];

await scraper(newTasks);
```
2. Example with ScraperConfig
```typescript
// typescript
import { scraper, TemplateTasks, ScraperConfig } from "@letsscrapedata/scraper";

const scraperConfig: ScraperConfig = {
  browserConfigs: [
    /* launch a chromium browser using puppeteer, no proxy */
    { browserControllerType: "puppeteer", proxyUrl: "" },
    /* launch a chromium browser using playwright, with a proxy */
    { browserControllerType: "playwright", proxyUrl: "http://proxyId:port" },
    /* connect to the current browser using patchright */
    { browserUrl: "http://localhost:9222/" },
  ],
  // Optional settings; uncomment the ones you need:
  // exitWhenCompleted: true,
  // lsdLaunchOptions: { headless: true },
  // loadUnfinishedTasks: true,
  // loadFailedTasksInterval: 5,
  // captcha: { clientKey: "xxx" }, // to solve captcha using 2captcha
};

const newTasks: TemplateTasks[] = [{ tid: 10002, parasstrs: ["9"] }];
await scraper(newTasks, scraperConfig);
```
ScraperConfig
Common configurations:
- Proxies and browser: browserConfigs, by default launching a browser using browserControllerType/browserType, without proxy
- Launch options of browser: lsdLaunchOptions, default {headless: false}
- Whether to load unfinished tasks: loadUnfinishedTasks, default false
- Whether to exit when completed: exitWhenCompleted, default false
- File format of scraped data: dataFileFormat, default "jsonl"
- API Key of captcha solver: captcha.clientKey
Complete configurations:
```typescript
/**
 * Configuration passed to `scraper()` as its optional second argument.
 * Every field is optional; the documented default applies when a field is omitted.
 */
export interface ScraperConfig {
  /**
   * whether to exit when all tasks are completed
   * @default false
   */
  exitWhenCompleted?: boolean;
  /**
   * whether to use the parasstr in XML if parasstr of a task is ""
   * @default false
   */
  useParasstrInXmlIfNeeded?: boolean;
  /**
   * whether to load unfinished tasks
   * @default false
   */
  loadUnfinishedTasks?: boolean;

  //////////////////////////////////////////////////////////////////////////// directory
  /**
   * @default "", which will use current directory of process + "/data/"
   * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
   */
  baseDir?: string;
  /**
   * filename in action_setvar_get/get_file must include inputFileDirPart for security.
   * @default "LetsScrapeData"
   */
  inputFileDirPart?: string;

  //////////////////////////////////////////////////////////////////////////// browser
  /**
   * whether to use puppeteer-extra-plugin-stealth; consider using patchright instead
   * @default false
   */
  useStealthPlugin?: boolean;
  /**
   * default browserControllerType of BrowserConfig
   * @default "patchright"
   */
  browserControllerType?: BrowserControllerType;
  /**
   * default browserType of BrowserConfig
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
  /**
   * launch options of the browser
   * @default { headless: false, geoip: true }
   */
  lsdLaunchOptions?: LsdLaunchOptions;
  /**
   * NOTE(review): presumably the options used when connecting to an existing browser - confirm
   * @default {browserUrl: ""}
   */
  lsdConnectOptions?: LsdConnectOptions;
  /**
   * Important: browsers to be launched or connected using proxyUrl
   * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
   */
  browserConfigs?: BrowserConfig[];

  //////////////////////////////////////////////////////////////////////////// captcha
  captcha?: {
    /**
     * clientKey of 2captcha
     */
    clientKey: string;
    // if you need to solve captcha in camoufox, please contact administrator
  };

  //////////////////////////////////////////////////////////////////////////// template
  /**
   * the default maximum number of concurrent tasks that can execute the same template in a browserContext
   * @default 1
   */
  maxConcurrency?: number;
  /**
   * NOTE(review): purpose is not described in this document - confirm
   * @default ""
   */
  readCode?: string;
  /**
   * NOTE(review): purpose is not described in this document - confirm
   * @default []
   */
  templateParas?: TemplatePara[];

  //////////////////////////////////////////////////////////////////////////// scheduler
  /**
   * NOTE(review): presumably the maximum number of concurrent tasks across all templates - confirm
   * @default 10
   */
  totalMaxConcurrency?: number;
  /**
   * minimum milliseconds between two tasks of the same template
   * @default 2000
   */
  minMiliseconds?: number;

  //////////////////////////////////////////////////////////////////////////// data
  /**
   * whether to move all dat_ files into a new directory "yyyyMMddHHmmss"
   * @default false
   */
  moveDataWhenStart?: boolean;
  /**
   ** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
   * @default "jsonl"
   */
  dataFileFormat?: DataFileFormat;
  /**
   * valid only when dataFileFormat is "txt"
   */
  columnSeperator?: string;
}
/**
 * Describes one browser to launch or connect to.
 * browserUrl and proxyUrl are mutually exclusive in effect: when both are given,
 * browserUrl has the higher priority and proxyUrl is ignored.
 */
export interface BrowserConfig {
  /**
   * browser controller used for this browser;
   * ScraperConfig.browserControllerType is documented as the default for this field
   */
  browserControllerType?: BrowserControllerType;

  /**
   * URL used to connect to the current (already running) browser
   ** the URL starts with "http://", such as "http://localhost:9222/"
   ** browserUrl can be used when you have logged in manually in advance.
   */
  browserUrl?: string;

  /**
   * proxy URL
   ** no proxy will be used if proxyUrl is ""
   ** valid only if !browserUrl
   */
  proxyUrl?: string;

  /**
   * type of browser to be launched
   * valid only if !browserUrl
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
}
```