[Add] browser-use and main.py

This commit is contained in:
tv0924@icloud.com 2025-05-18 21:57:54 +09:00
commit 96914d44ac
221 changed files with 30952 additions and 1 deletions

View file

@ -0,0 +1,421 @@
"""
Playwright browser on steroids.
"""
import asyncio
import gc
import logging
import os
import socket
import subprocess
from pathlib import Path
from tempfile import gettempdir
from typing import Literal
import httpx
import psutil
from dotenv import load_dotenv
from playwright.async_api import Browser as PlaywrightBrowser
from playwright.async_api import Playwright, async_playwright
from pydantic import AliasChoices, BaseModel, ConfigDict, Field
load_dotenv()
from browser_use.browser.chrome import (
CHROME_ARGS,
CHROME_DEBUG_PORT,
CHROME_DETERMINISTIC_RENDERING_ARGS,
CHROME_DISABLE_SECURITY_ARGS,
CHROME_DOCKER_ARGS,
CHROME_HEADLESS_ARGS,
)
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
from browser_use.utils import time_execution_async
logger = logging.getLogger(__name__)


def _env_flag_enabled(raw: 'str | None') -> bool:
    """Return True when an environment flag value starts with 't', 'y' or '1' (case-insensitive).

    An unset (None) or empty value counts as disabled instead of raising IndexError.
    """
    # Slice instead of indexing so an empty string yields '' rather than raising,
    # and compare against a tuple because '' is "in" every str.
    return (raw or '').lower()[:1] in ('t', 'y', '1')


# Whether the process runs inside Docker, as signalled by the IN_DOCKER environment variable.
IN_DOCKER = _env_flag_enabled(os.environ.get('IN_DOCKER'))
class ProxySettings(BaseModel):
    """Pydantic counterpart of playwright.sync_api.ProxySettings, so proxy options can be validated."""

    model_config = ConfigDict(populate_by_name=True, from_attributes=True)

    server: str
    bypass: str | None = None
    username: str | None = None
    password: str | None = None

    # Dict-style accessors, kept for compatibility with Playwright's ProxySettings.
    def __getitem__(self, key):
        """Return the attribute named ``key`` (``settings['server']``)."""
        return getattr(self, key)

    def get(self, key, default=None):
        """Return the attribute named ``key``, or ``default`` when there is no such attribute."""
        try:
            return getattr(self, key)
        except AttributeError:
            return default
class BrowserConfig(BaseModel):
    r"""
    Configuration for the Browser.

    Default values:
        headless: False
            Whether to run browser in headless mode (not recommended)

        disable_security: False
            Disable browser security features (required for cross-origin iframe support)

        extra_browser_args: []
            Extra arguments to pass to the browser

        wss_url: None
            Connect to a browser instance via WebSocket

        cdp_url: None
            Connect to a browser instance via CDP

        browser_class: 'chromium'
            Name of the Playwright browser type to use ('chromium', 'firefox' or 'webkit')

        browser_binary_path: None
            Path to a Browser instance to use to connect to your normal browser
            e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'
            (also accepted under the names browser_instance_path and chrome_instance_path)

        chrome_remote_debugging_port: CHROME_DEBUG_PORT (imported from browser_use.browser.chrome)
            Chrome remote debugging port to use to when browser_binary_path is supplied.
            This allows running multiple chrome browsers with same browser_binary_path but running on different ports.
            Also, makes it possible to launch new user provided chrome browser without closing already opened chrome instances,
            by providing non-default chrome debugging port.

        keep_alive: False
            Keep the browser alive after the agent has finished running

        deterministic_rendering: False
            Enable deterministic rendering (makes GPU/font rendering consistent across different OS's and docker)

        proxy: None
            Optional ProxySettings; passed to Playwright when launching a built-in browser

        new_context_config: BrowserContextConfig()
            Context configuration; its window size is used when launching a built-in browser
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        extra='ignore',
        populate_by_name=True,
        from_attributes=True,
        validate_assignment=True,
        revalidate_instances='subclass-instances',
    )

    # Remote connection targets; cdp_url is checked before wss_url when setting up the browser.
    wss_url: str | None = None
    cdp_url: str | None = None

    browser_class: Literal['chromium', 'firefox', 'webkit'] = 'chromium'
    # The legacy names 'browser_instance_path' and 'chrome_instance_path' are accepted on input.
    browser_binary_path: str | None = Field(
        default=None, validation_alias=AliasChoices('browser_instance_path', 'chrome_instance_path')
    )
    chrome_remote_debugging_port: int | None = CHROME_DEBUG_PORT
    extra_browser_args: list[str] = Field(default_factory=list)

    headless: bool = False
    disable_security: bool = False  # disable_security=True is dangerous as any malicious URL visited could embed an iframe for the user's bank, and use their cookies to steal money
    deterministic_rendering: bool = False
    keep_alive: bool = Field(default=False, alias='_force_keep_browser_alive')  # used to be called _force_keep_browser_alive

    proxy: ProxySettings | None = None
    new_context_config: BrowserContextConfig = Field(default_factory=BrowserContextConfig)
# @singleton: TODO - think about whether a singleton makes sense here
# @dev By default this is a singleton, but you can create multiple instances if you need to.
class Browser:
"""
Playwright browser on steroids.
This is persistent browser factory that can spawn multiple browser contexts.
It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise).
"""
def __init__(self, config: BrowserConfig | None = None):
    """Store the configuration and reset the Playwright handles.

    Args:
        config: Browser configuration; a default BrowserConfig is created when omitted.
    """
    logger.debug('🌎 Initializing new browser')
    # Both handles stay None until _init() starts Playwright.
    self.playwright: Playwright | None = None
    self.playwright_browser: PlaywrightBrowser | None = None
    self.config = config if config else BrowserConfig()
async def new_context(self, config: BrowserContextConfig | None = None) -> BrowserContext:
    """Create a BrowserContext bound to this browser.

    The dumped browser config forms the base; keys from the dumped ``config``
    (when given) override it before the result is fed to BrowserContextConfig.
    """
    merged: dict = {}
    if self.config:
        merged.update(self.config.model_dump())
    if config:
        merged.update(config.model_dump())
    context_config = BrowserContextConfig(**merged)
    return BrowserContext(config=context_config, browser=self)
async def get_playwright_browser(self) -> PlaywrightBrowser:
    """Return the Playwright browser, initializing it through _init() on first use."""
    existing = self.playwright_browser
    if existing is not None:
        return existing
    return await self._init()
@time_execution_async('--init (browser)')
async def _init(self):
    """Start Playwright, set up the browser and cache both handles on the instance."""
    self.playwright = await async_playwright().start()
    self.playwright_browser = await self._setup_browser(self.playwright)
    return self.playwright_browser
async def _setup_remote_cdp_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Connect to an already running browser over CDP at config.cdp_url.

    Raises:
        ValueError: If browser_binary_path mentions firefox (CDP is no longer supported there)
            or if cdp_url is not set.
    """
    binary_path = (self.config.browser_binary_path or '').lower()
    if 'firefox' in binary_path:
        raise ValueError(
            'CDP has been deprecated for firefox, check: https://fxdx.dev/deprecating-cdp-support-in-firefox-embracing-the-future-with-webdriver-bidi/'
        )

    cdp_url = self.config.cdp_url
    if not cdp_url:
        raise ValueError('CDP URL is required')

    logger.info(f'🔌 Connecting to remote browser via CDP {cdp_url}')
    browser_type = getattr(playwright, self.config.browser_class)
    return await browser_type.connect_over_cdp(cdp_url)
async def _setup_remote_wss_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Connect to a remote browser through the WebSocket endpoint config.wss_url.

    Raises:
        ValueError: If wss_url is not set.
    """
    wss_url = self.config.wss_url
    if not wss_url:
        raise ValueError('WSS URL is required')

    logger.info(f'🔌 Connecting to remote browser via WSS {wss_url}')
    browser_type = getattr(playwright, self.config.browser_class)
    return await browser_type.connect(wss_url)
async def _setup_user_provided_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Attach to, or launch, the browser binary at config.browser_binary_path and connect over CDP.

    Steps:
        1. If the debugging endpoint on chrome_remote_debugging_port already answers, reuse that browser.
        2. Otherwise pick a user data dir (taken from a --user-data-dir extra arg, else
           ~/.config/browseruse/profiles/default, else a temp-dir fallback), launch the binary
           with the debugging port enabled, probe the endpoint up to 10 times (1s apart) and connect.

    Raises:
        ValueError: If browser_binary_path is not set.
        AssertionError: If browser_class is not 'chromium'.
        RuntimeError: If connecting to the freshly launched browser fails.
    """
    if not self.config.browser_binary_path:
        raise ValueError('A browser_binary_path is required')

    assert self.config.browser_class == 'chromium', (
        'browser_binary_path only supports chromium browsers (make sure browser_class=chromium)'
    )

    debug_url = f'http://localhost:{self.config.chrome_remote_debugging_port}'
    browser_class = getattr(playwright, self.config.browser_class)

    # Check if browser is already running
    if await self._debug_endpoint_is_up(debug_url):
        logger.info(f'🔌 Reusing existing browser found running on {debug_url}')
        return await browser_class.connect_over_cdp(
            endpoint_url=debug_url,
            timeout=20000,  # 20 second timeout for connection
        )
    logger.debug('🌎 No existing Chrome instance found, starting a new one')

    # Work on a copy so repeated launches do not keep appending flags to the shared config.
    extra_args = list(self.config.extra_browser_args)
    provided_user_data_dir = [arg for arg in extra_args if '--user-data-dir=' in arg]

    if provided_user_data_dir:
        # Split only on the first '=' so a path that itself contains '=' is kept intact.
        user_data_dir = Path(provided_user_data_dir[0].split('=', 1)[1])
    else:
        try:
            # ~/.config/browseruse/profiles/default
            user_data_dir = (Path('~/.config') / 'browseruse' / 'profiles' / 'default').expanduser()
            user_data_dir.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            logger.error(f'❌ Failed to create ~/.config/browseruse directory: {type(e).__name__}: {e}')
            user_data_dir = Path(gettempdir()) / 'browseruse' / 'profiles' / 'default'  # /tmp/browseruse
            user_data_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f'🌐 Storing Browser Profile user data dir in: {user_data_dir}')
    try:
        # Remove any existing SingletonLock file to allow the browser to start
        (user_data_dir / 'Default' / 'SingletonLock').unlink()
        extra_args.append('--no-first-run')
    except OSError:
        # Covers FileNotFoundError and PermissionError: no lock to remove, or it cannot be removed.
        pass

    # Start a new Chrome instance.
    # dict.fromkeys removes duplicate flags while keeping their order (a set would scramble it).
    chrome_launch_args = list(
        dict.fromkeys(
            [
                f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
                *([] if provided_user_data_dir else [f'--user-data-dir={user_data_dir.resolve()}']),
                *CHROME_ARGS,
                *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
                *(CHROME_HEADLESS_ARGS if self.config.headless else []),
                *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
                *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
                *extra_args,
            ]
        )
    )
    chrome_sub_process = await asyncio.create_subprocess_exec(
        self.config.browser_binary_path,
        *chrome_launch_args,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        shell=False,
    )
    # Kept so close() can kill the process tree later.
    self._chrome_subprocess = psutil.Process(chrome_sub_process.pid)

    # Wait for the debugging endpoint of the new instance to come up.
    for _ in range(10):
        if await self._debug_endpoint_is_up(debug_url):
            break
        await asyncio.sleep(1)

    # Attempt to connect again after starting a new instance
    try:
        return await browser_class.connect_over_cdp(
            endpoint_url=debug_url,
            timeout=20000,  # 20 second timeout for connection
        )
    except Exception as e:
        logger.error(f'❌ Failed to start a new Chrome instance: {str(e)}')
        raise RuntimeError(
            'To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
        ) from e


async def _debug_endpoint_is_up(self, base_url: str) -> bool:
    """Return True when GET ``{base_url}/json/version`` answers with HTTP 200 (2s timeout)."""
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f'{base_url}/json/version', timeout=2)
    except httpx.RequestError:
        return False
    return response.status_code == 200
async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Launch one of Playwright's bundled browsers according to the config.

    Window size and position:
        - headless: fixed 1920x1080 at offset 0,0
        - headful with a viewport: window_width/window_height from new_context_config
        - otherwise: the detected screen resolution

    Raises:
        AssertionError: If browser_binary_path is set (use _setup_user_provided_browser instead).
    """
    assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'

    if self.config.headless:
        screen_size = {'width': 1920, 'height': 1080}
        offset_x, offset_y = 0, 0
    elif (
        hasattr(self.config, 'new_context_config')
        and hasattr(self.config.new_context_config, 'window_width')
        and hasattr(self.config.new_context_config, 'window_height')
        and not self.config.new_context_config.no_viewport
    ):
        # Use the configured window size from new_context_config if available
        screen_size = {
            'width': self.config.new_context_config.window_width,
            'height': self.config.new_context_config.window_height,
        }
        offset_x, offset_y = get_window_adjustments()
    else:
        screen_size = get_screen_resolution()
        offset_x, offset_y = get_window_adjustments()

    debug_port = self.config.chrome_remote_debugging_port
    debug_port_arg = f'--remote-debugging-port={debug_port}'

    # dict.fromkeys removes duplicate flags while keeping their order (a set would scramble it).
    chrome_args = list(
        dict.fromkeys(
            [
                # The port is optional in the config; skip the flag entirely when it is None.
                *([debug_port_arg] if debug_port is not None else []),
                *CHROME_ARGS,
                *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
                *(CHROME_HEADLESS_ARGS if self.config.headless else []),
                *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
                *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
                f'--window-position={offset_x},{offset_y}',
                f'--window-size={screen_size["width"]},{screen_size["height"]}',
                *self.config.extra_browser_args,
            ]
        )
    )

    # check if chrome remote debugging port is already taken,
    # if so remove the remote-debugging-port arg to prevent conflicts
    if debug_port is not None:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('localhost', debug_port)) == 0:
                chrome_args.remove(debug_port_arg)

    browser_class = getattr(playwright, self.config.browser_class)
    args = {
        'chromium': chrome_args,
        'firefox': list(dict.fromkeys(['-no-remote', *self.config.extra_browser_args])),
        'webkit': list(dict.fromkeys(['--no-startup-window', *self.config.extra_browser_args])),
    }

    # NOTE(review): channel='chromium' is passed for every browser_class, including
    # firefox and webkit — confirm that this is intended.
    browser = await browser_class.launch(
        channel='chromium',  # https://github.com/microsoft/playwright/issues/33566
        headless=self.config.headless,
        args=args[self.config.browser_class],
        proxy=self.config.proxy.model_dump() if self.config.proxy else None,
        handle_sigterm=False,
        handle_sigint=False,
    )
    return browser
async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
    """Choose and run the browser setup strategy that matches the config.

    Order of precedence: cdp_url, then wss_url, then browser_binary_path,
    and finally a built-in Playwright browser. Any failure is logged and re-raised.
    """
    try:
        cfg = self.config
        if cfg.cdp_url:
            return await self._setup_remote_cdp_browser(playwright)
        if cfg.wss_url:
            return await self._setup_remote_wss_browser(playwright)

        # Only locally launched browsers reach this point.
        if cfg.headless:
            logger.warning('⚠️ Headless mode is not recommended. Many sites will detect and block all headless browsers.')

        setup = self._setup_user_provided_browser if cfg.browser_binary_path else self._setup_builtin_browser
        return await setup(playwright)
    except Exception as exc:
        logger.error(f'Failed to initialize Playwright browser: {exc}')
        raise
async def close(self):
    """Close the browser instance.

    Does nothing when config.keep_alive is set. Otherwise closes the Playwright
    browser, stops Playwright and kills the Chrome subprocess tree (if one was
    launched). Each step is attempted independently, so a failure in one does
    not leak the resources handled by the others. Errors are logged at debug
    level and never raised.
    """
    if self.config.keep_alive:
        return

    browser = self.playwright_browser
    playwright = self.playwright
    chrome_proc = getattr(self, '_chrome_subprocess', None)

    # Reset the handles up front so the instance is clean whatever happens below.
    self.playwright_browser = None
    self.playwright = None
    self._chrome_subprocess = None

    try:
        if browser:
            try:
                await browser.close()
            except Exception as e:
                if 'OpenAI error' not in str(e):
                    logger.debug(f'Failed to close browser properly: {e}')

        if playwright:
            try:
                await playwright.stop()
            except Exception as e:
                if 'OpenAI error' not in str(e):
                    logger.debug(f'Failed to close browser properly: {e}')

        if chrome_proc:
            # always kill all children processes, otherwise chrome leaves a bunch of zombie processes
            try:
                doomed = [*chrome_proc.children(recursive=True), chrome_proc]
            except Exception as e:
                logger.debug(f'Failed to terminate chrome subprocess: {e}')
                doomed = [chrome_proc]
            for proc in doomed:
                try:
                    proc.kill()
                except Exception as e:
                    # One process that already exited must not stop the rest from being killed.
                    logger.debug(f'Failed to terminate chrome subprocess: {e}')
    finally:
        gc.collect()
def __del__(self):
    """Best-effort async cleanup when the object is destroyed.

    If an event loop is running in the current thread, close() is scheduled on
    it; otherwise close() is run to completion on a temporary loop. Any failure
    is logged at debug level and never raised.
    """
    try:
        # getattr: __init__ may not have finished, so the attributes can be missing.
        if not (getattr(self, 'playwright_browser', None) or getattr(self, 'playwright', None)):
            return
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # get_running_loop() raises when no loop is running, so this is the
            # only place the synchronous fallback can live.
            asyncio.run(self.close())
        else:
            loop.create_task(self.close())
    except Exception as e:
        logger.debug(f'Failed to cleanup browser in destructor: {e}')

View file

@ -0,0 +1,177 @@
# Chrome launch configuration: default paths, debugging port and the groups of
# command-line flags that get combined when a Chrome/Chromium instance is started.

CHROME_EXTENSIONS = {}  # coming in a separate PR
CHROME_EXTENSIONS_PATH = 'chrome_extensions'
CHROME_PROFILE_PATH = 'chrome_profile'
CHROME_PROFILE_USER = 'Default'
CHROME_DEBUG_PORT = 9242

# Chrome features switched off through --disable-features (joined at the end of CHROME_ARGS).
CHROME_DISABLED_COMPONENTS = [
    'Translate',
    'AcceptCHFrame',
    'OptimizationHints',
    'ProcessPerSiteUpToMainFrameThreshold',
    'InterestFeedContentSuggestions',
    # 'CalculateNativeWinOcclusion',
    'BackForwardCache',
    # 'HeavyAdPrivacyMitigations',
    'LazyFrameLoading',
    # 'ImprovedCookieControls',
    'PrivacySandboxSettings4',
    'AutofillServerCommunication',
    'CertificateTransparencyComponentUpdater',
    'DestroyProfileOnBrowserClose',
    'CrashReporting',
    'OverscrollHistoryNavigation',
    'InfiniteSessionRestore',
    # 'LockProfileCookieDatabase',  # disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624
]  # it's always best to give each chrome instance its own exclusive copy of the user profile

CHROME_HEADLESS_ARGS = [
    '--headless=new',
    # '--test-type',
    # '--test-type=gpu',  # https://github.com/puppeteer/puppeteer/issues/10516
    # '--enable-automation',  # <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
]

CHROME_DOCKER_ARGS = [
    # Docker-specific options
    # https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
    '--no-sandbox',  # rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
    '--disable-gpu-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',  # docker 75mb default shm size is not big enough, disabling just uses /tmp instead
    '--no-xshm',
    # dont try to disable (or install) dbus in docker, its not needed, chrome can work without dbus despite the errors
]

CHROME_DISABLE_SECURITY_ARGS = [
    # DANGER: JS isolation security features (to allow easier tampering with pages during automation)
    # chrome://net-internals
    '--disable-web-security',  # <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
    '--disable-site-isolation-trials',
    '--disable-features=IsolateOrigins,site-per-process',
    # '--allow-file-access-from-files',  # <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
    # DANGER: Disable HTTPS verification
    '--allow-running-insecure-content',  # Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
    '--ignore-certificate-errors',
    '--ignore-ssl-errors',
    '--ignore-certificate-errors-spki-list',
    # '--allow-insecure-localhost',
]

# flags to make chrome behave more deterministically across different OS's
CHROME_DETERMINISTIC_RENDERING_ARGS = [
    '--deterministic-mode',
    '--js-flags=--random-seed=1157259159',  # make all JS random numbers deterministic by providing a seed
    '--force-device-scale-factor=1',
    # GPU, canvas, text, and pdf rendering config
    # chrome://gpu
    '--enable-webgl',  # enable web-gl graphics support
    '--font-render-hinting=none',  # make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
    '--force-color-profile=srgb',  # make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb
    # '--disable-partial-raster',  # make rendering more deterministic (TODO: verify if still needed)
    '--disable-skia-runtime-opts',  # make rendering more deterministic by avoiding Skia hot path runtime optimizations
    '--disable-2d-canvas-clip-aa',  # make rendering more deterministic by disabling antialiasing on 2d canvas clips
    # '--disable-gpu',  # falls back to more consistent software renderer across all OS's, especially helps linux text rendering look less weird
    # '--use-gl=swiftshader',  <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
    # '--disable-software-rasterizer',  <- DO NOT USE, harmless, used in tandem with --disable-gpu
    # '--run-all-compositor-stages-before-draw',  <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
    # '--disable-gl-drawing-for-tests',  <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
    # '--blink-settings=imagesEnabled=false',  <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
]

# Flags applied to every launched Chrome instance. Each flag appears exactly once.
CHROME_ARGS = [
    # Process management & performance tuning
    # chrome://process-internals
    # '--disable-lazy-loading',  # make rendering more deterministic by loading all content up-front instead of on-focus
    # '--disable-renderer-backgrounding',  # dont throttle tab rendering based on focus/visibility
    # '--disable-background-networking',  # dont throttle tab networking based on focus/visibility
    # '--disable-background-timer-throttling',  # dont throttle tab timers based on focus/visibility
    # '--disable-backgrounding-occluded-windows',  # dont throttle tab window based on focus/visibility
    # '--disable-ipc-flooding-protection',  # dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
    # '--disable-extensions-http-throttling',  # dont throttle http traffic based on runtime heuristics
    # '--disable-field-trial-config',  # disable shared field trial state between browser processes
    # '--disable-back-forward-cache',  # disable browsing navigation cache
    # Profile data dir setup
    # chrome://profile-internals
    # f'--user-data-dir={CHROME_PROFILE_PATH}',  # managed by playwright arg instead
    # f'--profile-directory={CHROME_PROFILE_USER}',
    # '--password-store=basic',  # use mock keychain instead of OS-provided keychain (we manage auth.json instead)
    # '--use-mock-keychain',
    # '--disable-cookie-encryption',  # we need to be able to write unencrypted cookies to save/load auth.json
    '--disable-sync',  # don't try to use Google account sync features while automation is active
    # Extensions
    # chrome://inspect/#extensions
    # f'--load-extension={CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}',  # not needed when using existing profile that already has extensions installed
    # f'--allowlisted-extension-id={",".join(CHROME_EXTENSIONS.keys())}',
    '--allow-legacy-extension-manifests',
    '--allow-pre-commit-input',  # allow JS mutations before page rendering is complete
    '--disable-blink-features=AutomationControlled',  # hide the signatures that announce browser is being remote-controlled
    # f'--proxy-server=https://<host>:<port>:<username>:<password>',  # send all network traffic through a proxy https://2captcha.com/proxy (never commit real proxy credentials here)
    # f'--proxy-bypass-list=127.0.0.1',
    # Browser window and viewport setup
    # chrome://version
    # f'--user-agent="{DEFAULT_USER_AGENT}"',
    # f'--window-size={DEFAULT_VIEWPORT.width},{DEFAULT_VIEWPORT.height}',
    # '--window-position=0,0',
    # '--start-maximized',
    '--install-autogenerated-theme=0,0,0',  # black border makes it easier to see which chrome window is browser-use's
    '--hide-scrollbars',  # stop scrollbars from affecting screenshot width/height
    # '--virtual-time-budget=60000',  # DONT USE THIS, makes chrome hang forever and doesn't work, used to fast-forward all animations & timers by 60s, dont use this it's unfortunately buggy and breaks screenshot and PDF capture sometimes
    # '--autoplay-policy=no-user-gesture-required',  # auto-start videos so they trigger network requests + show up in outputs
    # '--disable-gesture-requirement-for-media-playback',
    # '--lang=en-US,en;q=0.9',
    # IO: stdin/stdout, debug port config
    # chrome://inspect
    '--log-level=2',  # 1=DEBUG 2=WARNING 3=ERROR
    '--enable-logging=stderr',
    # '--remote-debugging-address=127.0.0.1',  <- DONT USE THIS, no longer supported on chrome >100, never expose to non-localhost, would allow attacker to drive your browser from any machine
    # '--enable-experimental-extension-apis',  # add support for tab groups via chrome.tabs extension API
    '--disable-focus-on-load',  # prevent browser from hijacking focus
    '--disable-window-activation',
    # '--in-process-gpu',  <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
    # '--disable-component-extensions-with-background-pages',  # TODO: check this, disables chrome components that only run in background with no visible UI (could lower startup time)
    # uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
    # (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
    # '--use-fake-device-for-media-stream',
    # '--use-fake-ui-for-media-stream',
    # '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
    # Output format options (PDF, screenshot, etc.)
    '--export-tagged-pdf',  # include table on contents and tags in printed PDFs
    '--generate-pdf-document-outline',
    # Suppress first-run features, popups, hints, updates, etc.
    # chrome://system
    '--no-pings',
    '--no-default-browser-check',
    '--no-startup-window',
    '--ash-no-nudges',
    '--disable-infobars',
    '--disable-search-engine-choice-screen',
    '--disable-session-crashed-bubble',
    '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',  # disable browser self-update while automation is active
    '--hide-crash-restore-bubble',
    '--suppress-message-center-popups',
    '--disable-client-side-phishing-detection',
    '--disable-domain-reliability',
    '--disable-datasaver-prompt',
    '--disable-hang-monitor',
    '--disable-speech-synthesis-api',
    '--disable-speech-api',
    '--disable-print-preview',
    '--safebrowsing-disable-auto-update',
    # '--deny-permission-prompts',
    '--disable-external-intent-requests',
    # '--disable-notifications',
    '--disable-desktop-notifications',
    '--noerrdialogs',
    '--disable-prompt-on-repost',
    '--silent-debugger-extension-api',
    # '--block-new-web-contents',
    '--metrics-recording-only',
    '--disable-breakpad',
    # other feature flags
    # chrome://flags chrome://components
    f'--disable-features={",".join(CHROME_DISABLED_COMPONENTS)}',
    '--enable-features=NetworkService',
]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,348 @@
import logging
import os
import aiohttp
from playwright.async_api import Page, async_playwright
from browser_use.browser.service import Browser
from browser_use.browser.views import BrowserState, TabInfo
logger = logging.getLogger(__name__)
class DolphinBrowser(Browser):
"""A class for managing Dolphin Anty browser sessions using Playwright"""
def __init__(self, headless: bool = False, keep_open: bool = False):
    """
    Set up a disconnected DolphinBrowser.

    Args:
        headless (bool): Run browser in headless mode (default: False).
        keep_open (bool): Keep browser open after finishing tasks (default: False).
    """
    # NOTE(review): the parent Browser.__init__ is not called here — confirm that
    # none of the inherited methods rely on attributes it would set.

    # Caller-supplied options
    self.headless = headless
    self.keep_open = keep_open

    # Dolphin Anty API connection settings, read from the environment
    self.api_url = os.getenv('DOLPHIN_API_URL', 'http://localhost:3001/v1.0')
    self.api_token = os.getenv('DOLPHIN_API_TOKEN')
    self.profile_id = os.getenv('DOLPHIN_PROFILE_ID')

    # Connection state, filled in once a profile is connected
    self.playwright = None
    self.browser = None
    self.context = None
    self.page = None
    self.session = None
    self.cached_state = None
    self._pages: list[Page] = []  # Open pages (tabs), indexed by tab id
async def get_current_page(self) -> Page:
    """
    Return the page that is currently active.

    Raises:
        Exception: If no page is set.
    """
    current = self.page
    if current:
        return current
    raise Exception('No active page. Browser might not be connected.')
async def create_new_tab(self, url: str | None = None) -> None:
    """
    Open a new tab, make it the current page and optionally navigate it.

    Args:
        url (str, optional): URL to load in the new tab. Defaults to None (no navigation).

    Raises:
        Exception: If the browser context is not initialized or navigation fails.
    """
    if not self.context:
        raise Exception('Browser context not initialized')

    tab = await self.context.new_page()
    self.page = tab  # The new tab becomes the current page
    self._pages.append(tab)

    if not url:
        return

    try:
        # Load the URL, then wait once more for the page to settle
        await tab.goto(url, wait_until='networkidle')
        await self.wait_for_page_load()
    except Exception as exc:
        logger.error(f'Failed to navigate to URL {url}: {str(exc)}')
        raise
async def switch_to_tab(self, page_id: int) -> None:
    """
    Make the tab at index ``page_id`` the current page.

    Args:
        page_id (int): Index into the list of open tabs; negative values count from the end.

    Raises:
        Exception: If there are no tabs or the index is out of range.
    """
    if not self._pages:
        raise Exception('No tabs available')

    tab_count = len(self._pages)
    # Negative indices count from the end (e.g. -1 is the last tab)
    index = page_id + tab_count if page_id < 0 else page_id
    if not 0 <= index < tab_count:
        raise Exception(f'Tab index {index} out of range')

    self.page = self._pages[index]
    await self.page.bring_to_front()
    await self.wait_for_page_load()
async def get_tabs_info(self) -> list[TabInfo]:
    """
    Describe every open tab.

    Returns:
        list: One TabInfo (index, URL, title) per open tab, in tab order.
    """
    return [
        TabInfo(
            page_id=index,
            url=tab.url,
            title=await tab.title(),  # The title has to be fetched from the page
        )
        for index, tab in enumerate(self._pages)
    ]
async def wait_for_page_load(self, timeout: int = 30000):
    """
    Wait until the current page reaches the 'networkidle' load state.

    Does nothing when no page is set. A timeout or any other failure is
    logged as a warning and not raised.

    Args:
        timeout (int): Maximum time to wait in milliseconds (default: 30000ms).
    """
    if not self.page:
        return
    try:
        await self.page.wait_for_load_state('networkidle', timeout=timeout)
    except Exception as exc:
        logger.warning(f'Wait for page load timeout: {str(exc)}')
async def get_session(self):
    """
    Return the session, which is this DolphinBrowser instance itself.

    Returns:
        DolphinBrowser: This instance (also stored in ``self.session``).

    Raises:
        Exception: If the browser is not connected.
    """
    if self.browser:
        self.session = self
        return self
    raise Exception('Browser not connected. Call connect() first.')
async def authenticate(self):
    """
    Log in to the Dolphin Anty API with the configured API token.

    Returns:
        The decoded JSON body of the login response.

    Raises:
        Exception: If the API answers with a non-OK status.
    """
    endpoint = f'{self.api_url}/auth/login-with-token'
    payload = {'token': self.api_token}
    async with aiohttp.ClientSession() as http, http.post(endpoint, json=payload) as response:
        if response.ok:
            return await response.json()
        raise Exception(f'Failed to authenticate with Dolphin Anty: {await response.text()}')
async def get_browser_profiles(self):
    """
    Fetch the browser profiles available in Dolphin Anty.

    Returns:
        list: The ``data`` array of the response, or an empty list when it is absent.

    Raises:
        Exception: If authentication or the profile request fails.
    """
    # The API is authenticated against before every request
    await self.authenticate()

    endpoint = f'{self.api_url}/browser_profiles'
    auth_headers = {'Authorization': f'Bearer {self.api_token}'}
    async with aiohttp.ClientSession() as http, http.get(endpoint, headers=auth_headers) as response:
        if not response.ok:
            raise Exception(f'Failed to get browser profiles: {await response.text()}')
        payload = await response.json()
        return payload.get('data', [])
async def start_profile(self, profile_id: str | None = None, headless: bool = False) -> dict:
    """
    Start a browser profile on Dolphin Anty with automation enabled.

    Args:
        profile_id (str, optional): Profile to start; falls back to the profile ID read from the environment.
        headless (bool): Ask Dolphin Anty to start the profile headless (default: False).

    Returns:
        dict: The decoded JSON response describing the started profile.

    Raises:
        ValueError: If no profile ID is given and none is configured.
        Exception: If authentication fails or the API answers with a non-OK status.
    """
    await self.authenticate()

    target_profile = profile_id or self.profile_id
    if not target_profile:
        raise ValueError('No profile ID provided')

    endpoint = f'{self.api_url}/browser_profiles/{target_profile}/start'
    query = {'automation': 1, **({'headless': 1} if headless else {})}

    async with aiohttp.ClientSession() as http, http.get(endpoint, params=query) as response:
        if response.ok:
            return await response.json()
        raise Exception(f'Failed to start profile: {await response.text()}')
async def stop_profile(self, profile_id: str | None = None):
    """
    Stop a browser profile on Dolphin Anty.

    Args:
        profile_id (str, optional): Profile to stop; falls back to the profile ID read from the environment.

    Returns:
        dict: The decoded JSON response of the stop request (the HTTP status is not checked).

    Raises:
        ValueError: If no profile ID is given and none is configured.
    """
    await self.authenticate()

    target_profile = profile_id or self.profile_id
    if not target_profile:
        raise ValueError('No profile ID provided')

    endpoint = f'{self.api_url}/browser_profiles/{target_profile}/stop'
    async with aiohttp.ClientSession() as http, http.get(endpoint) as response:
        return await response.json()
async def connect(self, profile_id: str | None = None):
"""
Connect to a running browser profile using Playwright.
Args:
profile_id (str, optional): Profile ID to connect to (defaults to the one set in the environment).
Returns:
PlaywrightBrowser: The connected browser instance.
Raises:
Exception: If authentication or profile connection fails.
"""
# Authenticate before connecting to the profile
await self.authenticate()
# Start the browser profile
profile_data = await self.start_profile(profile_id)
if not profile_data.get('success'):
raise Exception(f'Failed to start profile: {profile_data}')
automation = profile_data['automation']
port = automation['port']
ws_endpoint = automation['wsEndpoint']
ws_url = f'ws://127.0.0.1:{port}{ws_endpoint}'
# Use Playwright to connect to the browser's WebSocket endpoint
self.playwright = await async_playwright().start()
self.browser = await self.playwright.chromium.connect_over_cdp(ws_url)
# Get or create a browser context and page
contexts = self.browser.contexts
self.context = contexts[0] if contexts else await self.browser.new_context()
pages = self.context.pages
self.page = pages[0] if pages else await self.context.new_page()
self._pages = [self.page] # Initialize pages list with the first page
return self.browser
async def close(self, force: bool = False):
    """
    Close the browser connection and clean up resources.

    Pages tracked in ``self._pages`` are closed on a best-effort basis, then the
    browser and the Playwright instance are shut down. Ordinary errors raised
    during cleanup are logged rather than propagated.

    Args:
        force (bool): If True, also call ``stop_profile()`` after closing
            (default: False).
    """
    try:
        # Close all open pages
        if self._pages:
            for page in self._pages:
                try:
                    await page.close()
                except Exception:
                    # Best effort: a page that fails to close must not block the
                    # rest of the cleanup. Only Exception is caught so that task
                    # cancellation (asyncio.CancelledError), KeyboardInterrupt and
                    # SystemExit are not silently swallowed.
                    pass
            self._pages = []

        # Close the browser and Playwright instance
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

        if force:
            await self.stop_profile()  # Force stop the profile
    except Exception as e:
        logger.error(f'Error during browser cleanup: {str(e)}')
async def get_current_state(self) -> BrowserState:
    """
    Get the current state of the browser (URL, content, viewport size, tabs).

    The resulting state is also stored on ``self.cached_state``.

    Returns:
        BrowserState: The current state of the browser.

    Raises:
        Exception: If no active page is available.
    """
    if not self.page:
        raise Exception('No active page')

    # Get page content and viewport size
    content = await self.page.content()
    # In Playwright's Python API `viewport_size` is a plain property (a dict or
    # None), not a coroutine function, so it must be read, not called/awaited.
    viewport_size = self.page.viewport_size

    # Create the current browser state; a missing viewport is reported as 0x0
    state = BrowserState(
        url=self.page.url,
        content=content,
        viewport_height=viewport_size['height'] if viewport_size else 0,
        viewport_width=viewport_size['width'] if viewport_size else 0,
        tabs=await self.get_tabs_info(),
    )

    # Cache and return the state
    self.cached_state = state
    return state
def __del__(self):
    """Finalizer hook for the instance; intentionally performs no work."""
    # No need to handle session cleanup as we're using self as session
    pass

View file

@ -0,0 +1,39 @@
import httpx
import pytest
from browser_use.browser.browser import Browser, BrowserConfig
@pytest.mark.asyncio
async def test_browser_close_doesnt_affect_external_httpx_clients():
    """
    Test that Browser.close() doesn't close HTTPX clients created outside the Browser instance.
    This test demonstrates the issue where Browser.close() is closing all HTTPX clients.

    The client's closed state is read from ``AsyncClient.is_closed`` instead of
    issuing a real request, so the result does not depend on network access.
    """
    # Create an external HTTPX client that should remain open
    external_client = httpx.AsyncClient()
    try:
        # Create a Browser instance
        browser = Browser(config=BrowserConfig(headless=True))

        # Close the browser (which should trigger cleanup_httpx_clients)
        await browser.close()

        # Our external client should not be closed by browser.close()
        assert not external_client.is_closed, 'External HTTPX client was incorrectly closed by Browser.close()'
    finally:
        # Always clean up our test client, even if Browser creation/close raised
        await external_client.aclose()

View file

@ -0,0 +1,36 @@
import asyncio
import base64
import pytest
from browser_use.browser.browser import Browser, BrowserConfig
async def test_take_full_page_screenshot():
    """Take a full-page screenshot of example.com and check it is a non-empty, strictly valid base64 string."""
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
    try:
        async with await browser.new_context() as context:
            page = await context.get_current_page()
            # Go to a test page
            await page.goto('https://example.com')

            await asyncio.sleep(3)
            # Take full page screenshot
            screenshot_b64 = await context.take_screenshot(full_page=True)
            await asyncio.sleep(3)

            # Verify screenshot is not empty and is a string
            assert screenshot_b64 is not None
            assert isinstance(screenshot_b64, str)
            assert len(screenshot_b64) > 0

            # Test we can decode the base64 string. validate=True is required for
            # this to be a real check: by default b64decode silently discards
            # characters outside the base64 alphabet instead of raising.
            try:
                base64.b64decode(screenshot_b64, validate=True)
            except Exception as e:
                pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')
    finally:
        # Shut the browser down whether or not the assertions passed
        await browser.close()
# Allow running this file directly as a script, without pytest
if __name__ == '__main__':
    asyncio.run(test_take_full_page_screenshot())

View file

@ -0,0 +1,96 @@
import asyncio
import json
import anyio
import pytest
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
from browser_use.utils import time_execution_sync
class ElementTreeSerializer:
    """Converts a DOM element tree into nested plain dictionaries."""

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """
        Recursively convert ``element_tree`` into nested dicts.

        Text nodes become ``{'type': 'text', 'text': ...}``; element nodes carry
        their tag name, attributes, highlight index and converted children. Any
        other node kind is converted to an empty dict.
        """

        def convert(node: DOMBaseNode) -> dict:
            # Text nodes are leaves
            if isinstance(node, DOMTextNode):
                return {'type': 'text', 'text': node.text}
            # Anything that is neither text nor element has no representation
            if not isinstance(node, DOMElementNode):
                return {}
            return {
                'type': 'element',
                'tag_name': node.tag_name,
                'attributes': node.attributes,
                'highlight_index': node.highlight_index,
                'children': list(map(convert, node.children)),
            }

        return convert(element_tree)
# run with: pytest browser_use/browser/tests/test_clicks.py
@pytest.mark.asyncio
async def test_highlight_elements():
    """
    Interactive manual check of element highlighting and clicking.

    Opens a page, then loops forever: dumps the element tree to
    ``./tmp/page.json``, prints XPaths shared by more than one selector-map
    entry, prompts on stdin for an index, and clicks that element. Exceptions
    raised inside an iteration are printed and the loop continues.
    """
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))

    async with await browser.new_context() as context:
        page = await context.get_current_page()
        # Alternative pages that can be used instead:
        # await page.goto('https://immobilienscout24.de')
        # await page.goto('https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans')
        # await page.goto('https://google.com/search?q=elon+musk')
        # await page.goto('https://kayak.com')
        # await page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe')
        # await page.goto('https://dictionary.cambridge.org')
        # await page.goto('https://github.com')
        await page.goto('https://huggingface.co/')

        await asyncio.sleep(1)

        while True:
            try:
                # await asyncio.sleep(10)
                state = await context.get_state(True)

                # Dump the current element tree for offline inspection
                async with await anyio.open_file('./tmp/page.json', 'w') as f:
                    tree_as_dict = ElementTreeSerializer.dom_element_node_to_json(state.element_tree)
                    await f.write(json.dumps(tree_as_dict, indent=1))

                # await time_execution_sync('highlight_selector_map_elements')(
                # 	browser.highlight_selector_map_elements
                # )(state.selector_map)

                # Nothing to inspect yet: fetch the state again
                if not state.selector_map:
                    continue

                # Find and print duplicate XPaths
                xpath_counts = {}
                for selector in state.selector_map.values():
                    xpath_counts[selector.xpath] = xpath_counts.get(selector.xpath, 0) + 1

                print('\nDuplicate XPaths found:')
                for xpath, count in xpath_counts.items():
                    if count <= 1:
                        continue
                    print(f'XPath: {xpath}')
                    print(f'Count: {count}\n')

                print(list(state.selector_map.keys()), 'Selector map keys')
                print(state.element_tree.clickable_elements_to_string())
                action = input('Select next action: ')

                await time_execution_sync('remove_highlight_elements')(context.remove_highlights)()

                # The typed index selects the element to click
                node_element = state.selector_map[int(action)]

                # check if index of selector map are the same as index of items in dom_items

                await context._click_element_node(node_element)

            except Exception as e:
                print(e)

View file

@ -0,0 +1,41 @@
import sys
def get_screen_resolution():
    """
    Return the screen size as ``{'width': ..., 'height': ...}``.

    On macOS the size of ``NSScreen.mainScreen()`` is used; elsewhere the first
    monitor reported by ``screeninfo``. If detection fails, a message is printed
    and a fixed fallback is returned (2560x1664 on macOS, 1920x1080 otherwise).
    """
    if sys.platform != 'darwin':  # Windows & Linux
        try:
            from screeninfo import get_monitors

            detected = get_monitors()
            if not detected:
                raise Exception('No monitors detected.')
            first = detected[0]
            return {'width': first.width, 'height': first.height}
        except ImportError:
            print("screeninfo package not found. Install it using 'pip install screeninfo'.")
        except Exception as e:
            print(f'Error retrieving screen resolution: {e}')
        # Detection failed: fall back to a common desktop resolution
        return {'width': 1920, 'height': 1080}

    # macOS
    try:
        from AppKit import NSScreen

        frame = NSScreen.mainScreen().frame()
        return {'width': int(frame.size.width), 'height': int(frame.size.height)}
    except ImportError:
        print('AppKit is not available. Make sure you are running this on macOS with pyobjc installed.')
    except Exception as e:
        print(f'Error retrieving macOS screen resolution: {e}')
    # Detection failed: fall back to a fixed macOS resolution
    return {'width': 2560, 'height': 1664}
def get_window_adjustments():
    """Returns recommended x, y offsets for window positioning on the current platform"""
    offsets_by_platform = {
        'darwin': (-4, 24),  # macOS has a small title bar, no border
        'win32': (-8, 0),  # Windows has a border on the left
    }
    # Every other platform (e.g. Linux) needs no adjustment
    return offsets_by_platform.get(sys.platform, (0, 0))

View file

@ -0,0 +1,54 @@
from dataclasses import dataclass, field
from typing import Any
from pydantic import BaseModel
from browser_use.dom.history_tree_processor.service import DOMHistoryElement
from browser_use.dom.views import DOMState
# Pydantic
class TabInfo(BaseModel):
    """Represents information about a browser tab"""

    # Identifier of the tab's page
    page_id: int
    # URL associated with the tab
    url: str
    # Title associated with the tab
    title: str
    # Optional; defaults to None when the tab has no parent page
    parent_page_id: int | None = None  # parent page that contains this popup or cross-origin iframe
@dataclass
class BrowserState(DOMState):
    """Browser state record: the fields inherited from DOMState plus page URL, title, tabs and optional extras."""

    # Required fields
    url: str
    title: str
    tabs: list[TabInfo]
    # Optional screenshot string; None when absent
    # NOTE(review): presumably base64-encoded image data — confirm against the producer
    screenshot: str | None = None
    # NOTE(review): presumably the scrollable pixel distance above/below the viewport — confirm
    pixels_above: int = 0
    pixels_below: int = 0
    # Error messages; default_factory gives every instance its own list
    browser_errors: list[str] = field(default_factory=list)
@dataclass
class BrowserStateHistory:
    """History record of a browser state: URL, title, tabs, interacted elements and an optional screenshot."""

    url: str
    title: str
    tabs: list[TabInfo]
    # One entry per interaction slot; an entry may be None
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return this record as a plain dict, dumping tabs and converting non-empty interacted elements."""
        return {
            'tabs': [tab.model_dump() for tab in self.tabs],
            'screenshot': self.screenshot,
            # Falsy entries (e.g. None) are stored as None
            'interacted_element': [None if not el else el.to_dict() for el in self.interacted_element],
            'url': self.url,
            'title': self.title,
        }
class BrowserError(Exception):
    """Base class for all browser errors; catch this to handle any of its subclasses."""
class URLNotAllowedError(BrowserError):
    """Error raised when a URL is not allowed (a BrowserError subclass)"""