抓取浏览器
Scraping Browser 代码示例
探索将 Bright Data 的 Scraping Browser 与 Playwright、Puppeteer 和 Selenium 等各种技术结合使用的详细代码示例。
以下是 Scraping Browser 在不同场景和库中的使用示例。
请确保在继续之前安装所需的库
目标页面的简单抓取
选择您喜欢的技术组合
#!/usr/bin/env node
// Simple scrape of a target page via Bright Data's Scraping Browser
// (Playwright connecting over the Chrome DevTools Protocol).
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects to the Scraping Browser endpoint, navigates to `url`
 * and prints the full page HTML.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping page content...`);
    const data = await page.content();
    console.log(`Scraped! Data: ${data}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env node
// Simple scrape of a target page via Bright Data's Scraping Browser
// (Playwright connecting over the Chrome DevTools Protocol).
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects to the Scraping Browser endpoint, navigates to `url`
 * and prints the full page HTML.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping page content...`);
    const data = await page.content();
    console.log(`Scraped! Data: ${data}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env python3
"""Simple scrape of a target page through Bright Data's Scraping Browser."""
import asyncio
from os import environ
from playwright.async_api import Playwright, async_playwright

AUTH = environ.get('AUTH', default='USER:PASS')
TARGET_URL = environ.get('TARGET_URL', default='https://example.com')


async def scrape(playwright: Playwright, url=TARGET_URL):
    """Connect over CDP, navigate to ``url`` and print the page HTML."""
    # Refuse to run with the placeholder credentials.
    if AUTH == 'USER:PASS':
        raise Exception('Provide Scraping Browsers credentials in AUTH '
                        'environment variable or update the script.')
    print('Connecting to Browser...')
    endpoint_url = f'wss://{AUTH}@brd.superproxy.io:9222'
    browser = await playwright.chromium.connect_over_cdp(endpoint_url)
    try:
        print(f'Connected! Navigating to {url}...')
        page = await browser.new_page()
        # Remote browsers can be slow; allow a 2-minute navigation timeout.
        await page.goto(url, timeout=2 * 60_000)
        print('Navigated! Scraping page content...')
        data = await page.content()
        print(f'Scraped! Data: {data}')
    finally:
        # Always release the remote browser session.
        await browser.close()


async def main():
    async with async_playwright() as playwright:
        await scrape(playwright)


if __name__ == '__main__':
    asyncio.run(main())
using PuppeteerSharp;
using System.Net.WebSockets;
using System.Text;

// Simple scrape of a target page through Bright Data's Scraping Browser.
class Scraper
{
    private string _auth;

    public Scraper(string auth)
    {
        _auth = auth;
    }

    // Opens a CDP connection to the Scraping Browser WebSocket endpoint,
    // authenticating with HTTP Basic auth.
    private async Task<IBrowser> Connect()
    {
        // Refuse to run with the placeholder credentials.
        if (_auth == "USER:PASS")
        {
            throw new Exception("Provide Scraping Browsers credentials in AUTH"
                + " environment variable or update the script.");
        }
        var options = new ConnectOptions()
        {
            BrowserWSEndpoint = "wss://brd.superproxy.io:9222",
            WebSocketFactory = async (uri, options, cToken) =>
            {
                var socket = new ClientWebSocket();
                var authBytes = Encoding.UTF8.GetBytes(_auth);
                // HTTP Basic authentication scheme. The original text had the
                // scheme name machine-translated ("基础"), which breaks auth.
                var authHeader = "Basic " + Convert.ToBase64String(authBytes);
                socket.Options.SetRequestHeader("Authorization", authHeader);
                socket.Options.KeepAliveInterval = TimeSpan.Zero;
                await socket.ConnectAsync(uri, cToken);
                return socket;
            },
        };
        return await Puppeteer.ConnectAsync(options);
    }

    // Navigates to the given URL and logs the full page HTML.
    public async Task Scrape(string url)
    {
        Log("Connecting to Browser...");
        var browser = await Connect();
        try {
            Log($"Connected! Navigating to {url}...");
            var page = await browser.NewPageAsync();
            // Remote browsers can be slow to load pages; allow up to 2 minutes.
            await page.GoToAsync(url, /* timeout= */ 2 * 60 * 1000);
            Log("Navigated! Scraping page content...");
            var data = await page.GetContentAsync();
            Log($"Scraped! Data: {data}");
        } finally {
            // Always release the remote browser session.
            await browser.CloseAsync();
        }
    }

    // Reads an environment variable with a fallback default.
    private static string Env(string name, string defaultValue)
    {
        return Environment.GetEnvironmentVariable(name) ?? defaultValue;
    }

    private static void Log(string message)
    {
        Console.WriteLine(message);
    }

    public static async Task Main()
    {
        var auth = Env("AUTH", "USER:PASS");
        var url = Env("TARGET_URL", "https://example.com");
        var scraper = new Scraper(auth);
        await scraper.Scrape(url);
    }
}
目标页面的简单抓取
选择您喜欢的技术组合
#!/usr/bin/env node
// Simple scrape of a target page via Bright Data's Scraping Browser
// (Playwright connecting over the Chrome DevTools Protocol).
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects to the Scraping Browser endpoint, navigates to `url`
 * and prints the full page HTML.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping page content...`);
    const data = await page.content();
    console.log(`Scraped! Data: ${data}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env node
// Simple scrape of a target page via Bright Data's Scraping Browser
// (Playwright connecting over the Chrome DevTools Protocol).
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects to the Scraping Browser endpoint, navigates to `url`
 * and prints the full page HTML.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping page content...`);
    const data = await page.content();
    console.log(`Scraped! Data: ${data}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env python3
"""Simple scrape of a target page through Bright Data's Scraping Browser."""
import asyncio
from os import environ
from playwright.async_api import Playwright, async_playwright

AUTH = environ.get('AUTH', default='USER:PASS')
TARGET_URL = environ.get('TARGET_URL', default='https://example.com')


async def scrape(playwright: Playwright, url=TARGET_URL):
    """Connect over CDP, navigate to ``url`` and print the page HTML."""
    # Refuse to run with the placeholder credentials.
    if AUTH == 'USER:PASS':
        raise Exception('Provide Scraping Browsers credentials in AUTH '
                        'environment variable or update the script.')
    print('Connecting to Browser...')
    endpoint_url = f'wss://{AUTH}@brd.superproxy.io:9222'
    browser = await playwright.chromium.connect_over_cdp(endpoint_url)
    try:
        print(f'Connected! Navigating to {url}...')
        page = await browser.new_page()
        # Remote browsers can be slow; allow a 2-minute navigation timeout.
        await page.goto(url, timeout=2 * 60_000)
        print('Navigated! Scraping page content...')
        data = await page.content()
        print(f'Scraped! Data: {data}')
    finally:
        # Always release the remote browser session.
        await browser.close()


async def main():
    async with async_playwright() as playwright:
        await scrape(playwright)


if __name__ == '__main__':
    asyncio.run(main())
using PuppeteerSharp;
using System.Net.WebSockets;
using System.Text;

// Simple scrape of a target page through Bright Data's Scraping Browser.
class Scraper
{
    private string _auth;

    public Scraper(string auth)
    {
        _auth = auth;
    }

    // Opens a CDP connection to the Scraping Browser WebSocket endpoint,
    // authenticating with HTTP Basic auth.
    private async Task<IBrowser> Connect()
    {
        // Refuse to run with the placeholder credentials.
        if (_auth == "USER:PASS")
        {
            throw new Exception("Provide Scraping Browsers credentials in AUTH"
                + " environment variable or update the script.");
        }
        var options = new ConnectOptions()
        {
            BrowserWSEndpoint = "wss://brd.superproxy.io:9222",
            WebSocketFactory = async (uri, options, cToken) =>
            {
                var socket = new ClientWebSocket();
                var authBytes = Encoding.UTF8.GetBytes(_auth);
                // HTTP Basic authentication scheme. The original text had the
                // scheme name machine-translated ("基础"), which breaks auth.
                var authHeader = "Basic " + Convert.ToBase64String(authBytes);
                socket.Options.SetRequestHeader("Authorization", authHeader);
                socket.Options.KeepAliveInterval = TimeSpan.Zero;
                await socket.ConnectAsync(uri, cToken);
                return socket;
            },
        };
        return await Puppeteer.ConnectAsync(options);
    }

    // Navigates to the given URL and logs the full page HTML.
    public async Task Scrape(string url)
    {
        Log("Connecting to Browser...");
        var browser = await Connect();
        try {
            Log($"Connected! Navigating to {url}...");
            var page = await browser.NewPageAsync();
            // Remote browsers can be slow to load pages; allow up to 2 minutes.
            await page.GoToAsync(url, /* timeout= */ 2 * 60 * 1000);
            Log("Navigated! Scraping page content...");
            var data = await page.GetContentAsync();
            Log($"Scraped! Data: {data}");
        } finally {
            // Always release the remote browser session.
            await browser.CloseAsync();
        }
    }

    // Reads an environment variable with a fallback default.
    private static string Env(string name, string defaultValue)
    {
        return Environment.GetEnvironmentVariable(name) ?? defaultValue;
    }

    private static void Log(string message)
    {
        Console.WriteLine(message);
    }

    public static async Task Main()
    {
        var auth = Env("AUTH", "USER:PASS");
        var url = Env("TARGET_URL", "https://example.com");
        var scraper = new Scraper(auth);
        await scraper.Scrape(url);
    }
}
打开页面并等待验证码检测与解决
选择您喜欢的技术组合
#!/usr/bin/env node
// Open a page through Bright Data's Scraping Browser and wait for its
// built-in captcha solver to detect and solve any captcha on the page.
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Navigates to `url` and waits for the Scraping Browser captcha solver.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // A raw CDP session is needed for Bright Data's custom commands.
    const client = await page.context().newCDPSession(page);
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Waiting captcha to detect and solve...`);
    // 'Captcha.waitForSolve' is Bright Data's custom CDP command; the
    // original text had the command name machine-translated ("验证码"),
    // which breaks the call.
    const { status } = await client.send('Captcha.waitForSolve', {
      detectTimeout: 10 * 1000,
    });
    console.log(`Captcha status: ${status}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env node
// Open a page through Bright Data's Scraping Browser and wait for its
// built-in captcha solver to detect and solve any captcha on the page.
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Navigates to `url` and waits for the Scraping Browser captcha solver.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Navigating to ${url}...`);
    const page = await browser.newPage();
    // A raw CDP session is needed for Bright Data's custom commands.
    const client = await page.context().newCDPSession(page);
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Waiting captcha to detect and solve...`);
    // 'Captcha.waitForSolve' is Bright Data's custom CDP command; the
    // original text had the command name machine-translated ("验证码"),
    // which breaks the call.
    const { status } = await client.send('Captcha.waitForSolve', {
      detectTimeout: 10 * 1000,
    });
    console.log(`Captcha status: ${status}`);
  } finally {
    // Always release the remote browser session.
    await browser.close();
  }
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env python3
"""Open a page via Bright Data's Scraping Browser and wait for captcha solve."""
import asyncio
from os import environ
from playwright.async_api import Playwright, async_playwright

AUTH = environ.get('AUTH', default='USER:PASS')
TARGET_URL = environ.get('TARGET_URL', default='https://example.com')


async def scrape(playwright: Playwright, url=TARGET_URL):
    """Navigate to ``url`` and wait for the built-in captcha solver."""
    # Refuse to run with the placeholder credentials.
    if AUTH == 'USER:PASS':
        raise Exception('Provide Scraping Browsers credentials in AUTH ' +
                        'environment variable or update the script.')
    print('Connecting to Browser...')
    endpoint_url = f'wss://{AUTH}@brd.superproxy.io:9222'
    browser = await playwright.chromium.connect_over_cdp(endpoint_url)
    try:
        print(f'Connected! Navigating to {url}...')
        page = await browser.new_page()
        # A raw CDP session is needed for Bright Data's custom commands.
        client = await page.context.new_cdp_session(page)
        await page.goto(url, timeout=2*60_000)
        print('Navigated! Waiting captcha to detect and solve...')
        # 'Captcha.waitForSolve' is Bright Data's custom CDP command; the
        # original text had the command name machine-translated ("验证码"),
        # which breaks the call.
        result = await client.send('Captcha.waitForSolve', {
            'detectTimeout': 10 * 1000,
        })
        status = result['status']
        print(f'Captcha status: {status}')
    finally:
        # Always release the remote browser session.
        await browser.close()


async def main():
    async with async_playwright() as playwright:
        await scrape(playwright)


if __name__ == '__main__':
    asyncio.run(main())
using PuppeteerSharp;
using System.Net.WebSockets;
using System.Text;

// Open a page via Bright Data's Scraping Browser and wait for its built-in
// captcha solver to detect and solve any captcha on the page.
class Scraper
{
    private string _auth;

    public Scraper(string auth)
    {
        _auth = auth;
    }

    // Opens a CDP connection to the Scraping Browser WebSocket endpoint,
    // authenticating with HTTP Basic auth.
    private async Task<IBrowser> Connect()
    {
        // Refuse to run with the placeholder credentials.
        if (_auth == "USER:PASS")
        {
            throw new Exception("Provide Scraping Browsers credentials in AUTH"
                + " environment variable or update the script.");
        }
        var options = new ConnectOptions()
        {
            BrowserWSEndpoint = "wss://brd.superproxy.io:9222",
            WebSocketFactory = async (uri, options, cToken) =>
            {
                var socket = new ClientWebSocket();
                var authBytes = Encoding.UTF8.GetBytes(_auth);
                // HTTP Basic authentication scheme. The original text had the
                // scheme name machine-translated ("基础"), which breaks auth.
                var authHeader = "Basic " + Convert.ToBase64String(authBytes);
                socket.Options.SetRequestHeader("Authorization", authHeader);
                socket.Options.KeepAliveInterval = TimeSpan.Zero;
                await socket.ConnectAsync(uri, cToken);
                return socket;
            },
        };
        return await Puppeteer.ConnectAsync(options);
    }

    // Navigates to the given URL and waits for captcha detection/solve.
    public async Task Scrape(string url)
    {
        Log("Connecting to Browser...");
        var browser = await Connect();
        try {
            Log($"Connected! Navigating to {url}...");
            var page = await browser.NewPageAsync();
            // A raw CDP session is needed for Bright Data's custom commands.
            var client = await page.Target.CreateCDPSessionAsync();
            await page.GoToAsync(url, /* timeout= */ 2 * 60 * 1000);
            Log("Navigated! Waiting captcha to detect and solve...");
            // "Captcha.waitForSolve" is Bright Data's custom CDP command; the
            // original text had the command name machine-translated ("验证码"),
            // which breaks the call.
            var result = await client.SendAsync("Captcha.waitForSolve", new
            {
                detectTimeout = 10 * 1000,
            });
            var status = (string) result["status"]!;
            Log($"Captcha status: {status}");
        } finally {
            // Always release the remote browser session.
            await browser.CloseAsync();
        }
    }

    // Reads an environment variable with a fallback default.
    private static string Env(string name, string defaultValue)
    {
        return Environment.GetEnvironmentVariable(name) ?? defaultValue;
    }

    private static void Log(string message)
    {
        Console.WriteLine(message);
    }

    public static async Task Main()
    {
        var auth = Env("AUTH", "USER:PASS");
        var url = Env("TARGET_URL", "https://example.com");
        var scraper = new Scraper(auth);
        await scraper.Scrape(url);
    }
}
检查抓取会话,使用 JS 代码段进行高级抓取
选择您喜欢的技术组合
#!/usr/bin/env node
// Start a Scraping Browser session, print its DevTools inspect URL so the
// session can be watched live, then scrape paragraph text from the page.
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects over CDP, exposes the session's inspect URL, and scrapes
 * all paragraph texts from `url`.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Starting inspect session...`);
    const page = await browser.newPage();
    // A raw CDP session is needed for Bright Data's custom Page.inspect.
    const client = await page.context().newCDPSession(page);
    const { frameTree: { frame } } = await client.send('Page.getFrameTree');
    const { url: inspectUrl } = await client.send('Page.inspect', {
      frameId: frame.id,
    });
    console.log(`You can inspect this session at: ${inspectUrl}.`);
    console.log(`Scraping will continue in 10 seconds...`);
    await sleep(10);
    console.log(`Navigating to ${url}...`);
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping paragraphs...`);
    const data = await page.$$eval('p', els => els.map(el => el.innerText));
    console.log(`Scraped! Data:`, data);
    // Keep the session alive so it can still be inspected in DevTools.
    console.log(`Session will be closed in 1 minute...`);
    await sleep(60);
  } finally {
    console.log(`Closing session.`);
    await browser.close();
  }
}

/** Resolves after the given number of seconds. */
function sleep(seconds) {
  return new Promise(resolve => setTimeout(resolve, seconds * 1000));
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env node
// Start a Scraping Browser session, print its DevTools inspect URL so the
// session can be watched live, then scrape paragraph text from the page.
const playwright = require('playwright');
const {
  AUTH = 'USER:PASS',
  TARGET_URL = 'https://example.com',
} = process.env;

/**
 * Connects over CDP, exposes the session's inspect URL, and scrapes
 * all paragraph texts from `url`.
 * @param {string} url - Page to scrape (defaults to TARGET_URL).
 * @throws {Error} When AUTH still holds the placeholder credentials.
 */
async function scrape(url = TARGET_URL) {
  // Refuse to run with the placeholder credentials.
  if (AUTH === 'USER:PASS') {
    throw new Error(`Provide Scraping Browsers credentials in AUTH`
      + ` environment variable or update the script.`);
  }
  console.log(`Connecting to Browser...`);
  const endpointURL = `wss://${AUTH}@brd.superproxy.io:9222`;
  const browser = await playwright.chromium.connectOverCDP(endpointURL);
  try {
    console.log(`Connected! Starting inspect session...`);
    const page = await browser.newPage();
    // A raw CDP session is needed for Bright Data's custom Page.inspect.
    const client = await page.context().newCDPSession(page);
    const { frameTree: { frame } } = await client.send('Page.getFrameTree');
    const { url: inspectUrl } = await client.send('Page.inspect', {
      frameId: frame.id,
    });
    console.log(`You can inspect this session at: ${inspectUrl}.`);
    console.log(`Scraping will continue in 10 seconds...`);
    await sleep(10);
    console.log(`Navigating to ${url}...`);
    // Remote browsers can be slow to load pages; allow up to 2 minutes.
    await page.goto(url, { timeout: 2 * 60 * 1000 });
    console.log(`Navigated! Scraping paragraphs...`);
    const data = await page.$$eval('p', els => els.map(el => el.innerText));
    console.log(`Scraped! Data:`, data);
    // Keep the session alive so it can still be inspected in DevTools.
    console.log(`Session will be closed in 1 minute...`);
    await sleep(60);
  } finally {
    console.log(`Closing session.`);
    await browser.close();
  }
}

/** Resolves after the given number of seconds. */
function sleep(seconds) {
  return new Promise(resolve => setTimeout(resolve, seconds * 1000));
}

if (require.main === module) {
  scrape().catch(error => {
    console.error(error.stack || error.message || error);
    process.exit(1);
  });
}
#!/usr/bin/env python3
"""Inspect a Scraping Browser session live, then scrape paragraph text."""
import asyncio
from os import environ
from playwright.async_api import Playwright, async_playwright

AUTH = environ.get('AUTH', default='USER:PASS')
TARGET_URL = environ.get('TARGET_URL', default='https://example.com')


async def scrape(playwright: Playwright, url=TARGET_URL):
    """Expose the session's DevTools inspect URL and scrape all <p> texts."""
    # Refuse to run with the placeholder credentials.
    if AUTH == 'USER:PASS':
        raise Exception('Provide Scraping Browsers credentials in AUTH '
                        'environment variable or update the script.')
    print('Connecting to Browser...')
    endpoint_url = f'wss://{AUTH}@brd.superproxy.io:9222'
    browser = await playwright.chromium.connect_over_cdp(endpoint_url)
    try:
        print('Connected! Starting inspect session...')
        page = await browser.new_page()
        # A raw CDP session is needed for Bright Data's custom Page.inspect.
        client = await page.context.new_cdp_session(page)
        frames = await client.send('Page.getFrameTree')
        frame_id = frames['frameTree']['frame']['id']
        inspect = await client.send('Page.inspect', {
            'frameId': frame_id,
        })
        inspect_url = inspect['url']
        print(f'You can inspect this session at: {inspect_url}.')
        print('Scraping will continue in 10 seconds...')
        await asyncio.sleep(10)
        print(f'Navigating to {url}...')
        # Remote browsers can be slow; allow a 2-minute navigation timeout.
        await page.goto(url, timeout=2 * 60_000)
        print('Navigated! Scraping paragraphs...')
        data = await page.eval_on_selector_all(
            'p', 'els => els.map(el => el.innerText)')
        print('Scraped! Data', data)
        # Keep the session alive so it can still be inspected in DevTools.
        print('Session will be closed in 1 minute...')
        await asyncio.sleep(60)
    finally:
        print('Closing session.')
        await browser.close()


async def main():
    async with async_playwright() as playwright:
        await scrape(playwright)


if __name__ == '__main__':
    asyncio.run(main())
using PuppeteerSharp;
using System.Net.WebSockets;
using System.Text;

// Inspect a Scraping Browser session live via its DevTools URL, then scrape
// paragraph text from the target page.
class Scraper
{
    private string _auth;

    public Scraper(string auth)
    {
        _auth = auth;
    }

    // Opens a CDP connection to the Scraping Browser WebSocket endpoint,
    // authenticating with HTTP Basic auth.
    private async Task<IBrowser> Connect()
    {
        // Refuse to run with the placeholder credentials.
        if (_auth == "USER:PASS")
        {
            throw new Exception("Provide Scraping Browsers credentials in AUTH"
                + " environment variable or update the script.");
        }
        var options = new ConnectOptions()
        {
            BrowserWSEndpoint = "wss://brd.superproxy.io:9222",
            WebSocketFactory = async (uri, options, cToken) =>
            {
                var socket = new ClientWebSocket();
                var authBytes = Encoding.UTF8.GetBytes(_auth);
                // HTTP Basic authentication scheme. The original text had the
                // scheme name machine-translated ("基础"), which breaks auth.
                var authHeader = "Basic " + Convert.ToBase64String(authBytes);
                socket.Options.SetRequestHeader("Authorization", authHeader);
                socket.Options.KeepAliveInterval = TimeSpan.Zero;
                await socket.ConnectAsync(uri, cToken);
                return socket;
            },
        };
        return await Puppeteer.ConnectAsync(options);
    }

    // Prints the session's inspect URL, then scrapes all <p> texts from url.
    public async Task Scrape(string url)
    {
        Log("Connecting to Browser...");
        var browser = await Connect();
        try {
            Log("Connected! Starting inspect session...");
            var page = await browser.NewPageAsync();
            // A raw CDP session is needed for Bright Data's Page.inspect.
            var client = await page.Target.CreateCDPSessionAsync();
            var frames = await client.SendAsync("Page.getFrameTree");
            var frameId = (string) frames!["frameTree"]!["frame"]!["id"]!;
            var inspect = await client.SendAsync("Page.inspect",
                new { frameId = frameId });
            var inspectUrl = (string) inspect!["url"]!;
            Log($"You can inspect this session at: {inspectUrl}");
            Log("Scraping will continue in 10 seconds...");
            await Task.Delay(10 * 1000);
            Log($"Navigating to {url}...");
            // Remote browsers can be slow to load pages; allow up to 2 minutes.
            await page.GoToAsync(url, /* timeout= */ 2 * 60 * 1000);
            Log("Navigated! Scraping paragraphs...");
            var paragraphs = await page.QuerySelectorAllHandleAsync("p");
            var data = await paragraphs.EvaluateFunctionAsync(
                "els => els.map(el => el.innerText)");
            Log($"Scraped! Data: {data}");
            // Keep the session alive so it can still be inspected in DevTools.
            Log("Session will be closed in 1 minute...");
            await Task.Delay(60 * 1000);
        } finally {
            Log("Closing session.");
            await browser.CloseAsync();
        }
    }

    // Reads an environment variable with a fallback default.
    private static string Env(string name, string defaultValue)
    {
        return Environment.GetEnvironmentVariable(name) ?? defaultValue;
    }

    private static void Log(string message)
    {
        Console.WriteLine(message);
    }

    public static async Task Main()
    {
        var auth = Env("AUTH", "USER:PASS");
        var url = Env("TARGET_URL", "https://example.com");
        var scraper = new Scraper(auth);
        await scraper.Scrape(url);
    }
}