Browser API 快速开始示例
运行这些基本示例以检查您的 Browser API 是否正常工作(请记得替换为您的凭证和目标 URL):
- NodeJS
- Python
- C#
复制
#!/usr/bin/env node
const puppeteer = require('puppeteer-core');
// Browser API zone credentials and the target URL are taken from the
// environment; replace the fallback values below to hard-code your own.
const AUTH = process.env.AUTH ?? 'USER:PASS';
const TARGET_URL = process.env.TARGET_URL ?? 'https://example.com';
/**
 * Connects to the remote browser, opens `url` in a new page and logs the
 * page's HTML content.
 *
 * @param {string} [url=TARGET_URL] - Page to navigate to.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 * @throws {Error} When AUTH still holds the 'USER:PASS' placeholder.
 */
async function scrape(url = TARGET_URL) {
  const isPlaceholderAuth = AUTH === 'USER:PASS';
  if (isPlaceholderAuth) {
    throw new Error(
      `Provide Browser API credentials in AUTH`
      + ` environment variable or update the script.`,
    );
  }

  console.log(`Connecting to Browser...`);
  const browser = await puppeteer.connect({
    browserWSEndpoint: `wss://${AUTH}@brd.superproxy.io:9222`,
  });

  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();

    // Ask the CDP session for the main frame id, then send 'Page.inspect'
    // for that frame and log the URL it returns.
    const cdp = await page.createCDPSession();
    const frameTreeReply = await cdp.send('Page.getFrameTree');
    const mainFrame = frameTreeReply.frameTree.frame;
    const inspectReply = await cdp.send('Page.inspect', {
      frameId: mainFrame.id,
    });
    const inspectUrl = inspectReply.url;
    console.log(`You can inspect this session at: ${inspectUrl}.`);

    // Two minutes, expressed in milliseconds.
    const navigationTimeoutMs = 2 * 60 * 1000;
    await page.goto(url, { timeout: navigationTimeoutMs });

    console.log(`Navigated! Scraping page content...`);
    const data = await page.content();
    console.log(`Scraped! Data: ${data}`);
  } finally {
    // Close the browser whether or not navigation/scraping succeeded.
    await browser.close();
  }
}
/**
 * Builds a readable message from the response attached to a connection error.
 *
 * Reads `statusCode` and `statusMessage` from `error.target._req.res`
 * (presumably the HTTP response of a failed WebSocket handshake — confirm
 * against the WebSocket client in use).
 *
 * @param {*} error - Value the scrape promise rejected with; may be any value.
 * @returns {string|undefined} The status message, or `undefined` when the
 *   error carries no such response.
 */
function getErrorDetails(error) {
  // A promise can reject with null/undefined, so the first hop is guarded too;
  // otherwise this helper would throw inside the caller's error handler and
  // hide the original failure.
  const response = error?.target?._req?.res;
  if (!response) {
    return undefined;
  }
  const { statusCode, statusMessage } = response;
  return `Unexpected Server Status ${statusCode}: ${statusMessage}`;
}
// Run the scraper only when this file is executed directly, not when it is
// loaded by another module via require().
if (require.main === module) {
  scrape().catch((error) => {
    // Prefer the server-status summary, then fall back to progressively
    // less detailed descriptions of the failure.
    const report = getErrorDetails(error)
      || error.stack
      || error.message
      || error;
    console.error(report);
    process.exit(1);
  });
}
复制
#!/usr/bin/env python3
import asyncio
from os import environ
from playwright.async_api import Playwright, async_playwright
# Browser API zone credentials and the target URL are taken from the
# environment; replace the fallback values below to hard-code your own.
AUTH = environ.get('AUTH', 'USER:PASS')
TARGET_URL = environ.get('TARGET_URL', 'https://example.com')
async def scrape(playwright: Playwright, url=TARGET_URL):
    """Connect to the remote browser, open ``url`` and print the page HTML.

    Args:
        playwright: Playwright instance used to connect over CDP.
        url: Page to navigate to; defaults to ``TARGET_URL``.

    Raises:
        Exception: If ``AUTH`` still holds the 'USER:PASS' placeholder.
    """
    if AUTH == 'USER:PASS':
        raise Exception('Provide Browser API credentials in AUTH '
                        'environment variable or update the script.')

    print('Connecting to Browser...')
    browser = await playwright.chromium.connect_over_cdp(
        f'wss://{AUTH}@brd.superproxy.io:9222')
    try:
        print(f'Connected! Navigating to {url}...')
        page = await browser.new_page()

        # Ask the CDP session for the main frame id, then send 'Page.inspect'
        # for that frame and print the URL it returns.
        cdp = await page.context.new_cdp_session(page)
        tree_reply = await cdp.send('Page.getFrameTree')
        inspect_reply = await cdp.send(
            'Page.inspect',
            {'frameId': tree_reply['frameTree']['frame']['id']},
        )
        inspect_url = inspect_reply['url']
        print(f'You can inspect this session at: {inspect_url}.')

        # Two minutes, expressed in milliseconds.
        navigation_timeout_ms = 2 * 60_000
        await page.goto(url, timeout=navigation_timeout_ms)

        print('Navigated! Scraping page content...')
        data = await page.content()
        print(f'Scraped! Data: {data}')
    finally:
        # Close the browser whether or not navigation/scraping succeeded.
        await browser.close()
async def main():
    """Run one scrape with the default URL inside an async_playwright() context."""
    async with async_playwright() as pw:
        await scrape(pw)


# Start the event loop only when this file is run as a script.
if __name__ == '__main__':
    asyncio.run(main())
复制
using PuppeteerSharp;
using System.Net.WebSockets;
using System.Text;
/// <summary>
/// Connects to the remote browser with PuppeteerSharp, navigates to a URL
/// and prints the page's HTML content.
/// </summary>
class Scraper
{
    // Credentials in "USER:PASS" form; set once in the constructor.
    private readonly string _auth;

    /// <summary>Creates a scraper that authenticates with <paramref name="auth"/>.</summary>
    /// <param name="auth">Browser API zone credentials in "USER:PASS" form.</param>
    public Scraper(string auth)
    {
        _auth = auth;
    }

    /// <summary>
    /// Opens the WebSocket connection to the remote browser, sending the
    /// credentials as a Basic Authorization header.
    /// </summary>
    /// <returns>The connected browser.</returns>
    /// <exception cref="Exception">
    /// Thrown when the credentials still hold the "USER:PASS" placeholder.
    /// </exception>
    private async Task<IBrowser> Connect()
    {
        if (_auth == "USER:PASS")
        {
            throw new Exception("Provide Browser API credentials in AUTH"
                + " environment variable or update the script.");
        }
        var options = new ConnectOptions()
        {
            BrowserWSEndpoint = "wss://brd.superproxy.io:9222",
            // The second lambda parameter is unused; it is named
            // connectionOptions so it does not shadow the local `options`.
            WebSocketFactory = async (uri, connectionOptions, cToken) =>
            {
                var socket = new ClientWebSocket();
                try
                {
                    var authBytes = Encoding.UTF8.GetBytes(_auth);
                    var authHeader = "Basic " + Convert.ToBase64String(authBytes);
                    socket.Options.SetRequestHeader("Authorization", authHeader);
                    socket.Options.KeepAliveInterval = TimeSpan.Zero;
                    await socket.ConnectAsync(uri, cToken);
                    return socket;
                }
                catch
                {
                    // Nobody else owns the socket yet, so release it here
                    // before propagating the connection failure.
                    socket.Dispose();
                    throw;
                }
            },
        };
        return await Puppeteer.ConnectAsync(options);
    }

    /// <summary>
    /// Connects, navigates to <paramref name="url"/> and writes the page
    /// HTML to the console. The browser is closed even if a step fails.
    /// </summary>
    /// <param name="url">Page to navigate to.</param>
    public async Task Scrape(string url)
    {
        Console.WriteLine("Connecting to Browser...");
        var browser = await Connect();
        try
        {
            Console.WriteLine($"Connected! Navigating to {url}...");
            var page = await browser.NewPageAsync();

            // Ask the CDP session for the main frame id, then send
            // "Page.inspect" for that frame and print the URL it returns.
            var client = await page.Target.CreateCDPSessionAsync();
            var frames = await client.SendAsync("Page.getFrameTree");
            var frameId = frames!.Value.GetProperty("frameTree").GetProperty("frame")
                .GetProperty("id").GetString();
            var parameters = new Dictionary<string, object> { { "frameId", frameId } };
            var inspect = await client.SendAsync("Page.inspect", parameters);
            var inspectUrl = inspect!.Value.GetProperty("url").GetString();
            Console.WriteLine($"You can inspect this session at: {inspectUrl}");

            await page.GoToAsync(url, /* timeout= */ 2 * 60 * 1000);
            Console.WriteLine("Navigated! Scraping page content...");
            var data = await page.GetContentAsync();
            Console.WriteLine($"Scraped! Data: {data}");
        }
        finally
        {
            await browser.CloseAsync();
        }
    }

    /// <summary>
    /// Returns the environment variable <paramref name="name"/>, or
    /// <paramref name="defaultValue"/> when it is not set.
    /// </summary>
    private static string Env(string name, string defaultValue)
    {
        return Environment.GetEnvironmentVariable(name) ?? defaultValue;
    }

    /// <summary>Entry point: reads AUTH and TARGET_URL, then runs one scrape.</summary>
    public static async Task Main()
    {
        // Replace with your Browser API zone credentials
        var auth = Env("AUTH", "USER:PASS");
        var url = Env("TARGET_URL", "https://example.com");
        var scraper = new Scraper(auth);
        await scraper.Scrape(url);
    }
}
每个会话仅支持一次初始导航
Browser API 会话的结构允许每个会话进行一次初始导航。初始导航是指首次加载需采集数据的目标站点。之后,用户可以在同一会话中通过点击、滚动等交互方式自由导航网站。但如果您想开始一个新的采集任务——无论是相同网站还是不同网站——并重新从初始导航步骤开始,就必须启动一个新的会话。
会话时间限制
Browser API 提供两种超时机制,用于保护用户避免失控或异常使用:
- 空闲会话超时:如果浏览器会话空闲超过 5 分钟(没有任何活动流经它),Browser API 将自动结束该会话。
- 最大会话时长超时:Browser API 会话最长可持续 30 分钟。一旦达到最大时长,会话将自动结束。