[Add] browser-use and main.py

This commit is contained in:
tv0924@icloud.com 2025-05-18 21:57:54 +09:00
commit 96914d44ac
221 changed files with 30952 additions and 1 deletions

View file

@ -0,0 +1,59 @@
"""
Test configuration for browser-use.
"""
import logging
import os
import sys
import pytest
from langchain_openai import ChatOpenAI
from pydantic import SecretStr
# Ensure the project root (the parent of this file's directory) is importable
# by putting it at the front of sys.path.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
# Load environment variables; load_dotenv() is called with its defaults.
from dotenv import load_dotenv
load_dotenv()
# Configure root logging at DEBUG level and create this module's logger.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
@pytest.fixture(scope='session')
def llm():
    """
    Session-scoped fixture returning a real ChatOpenAI client for the 'gpt-4o' model.

    The key is read from OPENAI_API_KEY and wrapped in a SecretStr; when the
    variable is unset or empty, ``api_key=None`` is passed instead. No mock is
    substituted in either case.
    """
    raw_key = os.getenv('OPENAI_API_KEY')
    has_key = bool(raw_key)
    logger.debug(f'API Key present: {has_key}')
    logger.debug('Using actual ChatOpenAI model')
    secret = SecretStr(raw_key) if has_key else None
    return ChatOpenAI(model='gpt-4o', api_key=secret)
@pytest.fixture(scope='session')
def browser():
    """
    Session-scoped fixture that builds one headless Browser with security disabled.

    NOTE(review): the instance is returned without any teardown in this
    fixture; confirm whether something else is expected to close it.
    """
    logger.debug('Creating Browser instance for testing')
    settings = BrowserConfig(headless=True, disable_security=True)
    return Browser(config=settings)
@pytest.fixture(scope='function')
async def browser_context(browser):
    """
    Per-test fixture yielding a BrowserContext bound to the shared ``browser``.

    The context is closed once the test that requested it has finished.
    """
    logger.debug('Creating BrowserContext instance for testing')
    ctx = BrowserContext(browser=browser)
    yield ctx
    # Teardown: runs after the consuming test returns.
    await ctx.close()

View file

@ -0,0 +1,305 @@
from unittest.mock import MagicMock
import pytest
from playwright.async_api import Page
from pydantic import BaseModel
from browser_use.controller.registry.service import Registry
from browser_use.controller.registry.views import ActionRegistry, RegisteredAction
# Parameter model with no fields, for test actions that take no arguments.
class EmptyParamModel(BaseModel):
    pass
class TestActionFilters:
    """Checks that the registry exposes actions according to their domain and page filters."""

    @staticmethod
    def _make_action(name, description, domains=None, page_filter=None):
        """Build a RegisteredAction with a no-op function and an empty parameter model."""
        return RegisteredAction(
            name=name,
            description=description,
            function=lambda: None,
            param_model=EmptyParamModel,
            domains=domains,
            page_filter=page_filter,
        )

    @staticmethod
    def _make_page(url):
        """Return a Playwright Page mock whose ``url`` attribute is set to *url*."""
        page = MagicMock(spec=Page)
        page.url = url
        return page

    def test_get_prompt_description_no_filters(self):
        """The system prompt (no page given) lists only actions without any filter."""
        registry = ActionRegistry()
        candidates = [
            self._make_action('no_filter_action', 'Action with no filters'),
            self._make_action('page_filter_action', 'Action with page filter', page_filter=lambda page: True),
            self._make_action('domain_filter_action', 'Action with domain filter', domains=['example.com']),
        ]
        registry.actions = {action.name: action for action in candidates}

        # No page argument: this is the description used for the system prompt.
        system_description = registry.get_prompt_description()

        assert 'no_filter_action' in system_description
        assert 'page_filter_action' not in system_description
        assert 'domain_filter_action' not in system_description

    def test_page_filter_matching(self):
        """Only actions whose page_filter accepts the given page are described."""
        registry = ActionRegistry()
        page = self._make_page('https://example.com/page')

        accepted = self._make_action(
            'matching_action',
            'Action with matching page filter',
            page_filter=lambda page: 'example.com' in page.url,
        )
        rejected = self._make_action(
            'non_matching_action',
            'Action with non-matching page filter',
            page_filter=lambda page: 'other.com' in page.url,
        )
        registry.actions = {'matching_action': accepted, 'non_matching_action': rejected}

        page_description = registry.get_prompt_description(page)

        assert 'matching_action' in page_description
        assert 'non_matching_action' not in page_description

    def test_domain_filter_matching(self):
        """Domain filters are matched against the page URL, including glob patterns."""
        registry = ActionRegistry()
        # (action name, description, domain pattern)
        specs = [
            ('exact_match', 'Exact domain match', 'example.com'),
            ('subdomain_match', 'Subdomain wildcard match', '*.example.com'),
            ('prefix_match', 'Prefix wildcard match', 'example*'),
            ('non_matching', 'Non-matching domain', 'other.com'),
        ]
        registry.actions = {
            name: self._make_action(name, description, domains=[pattern])
            for name, description, pattern in specs
        }

        # Exact domain.
        page = self._make_page('https://example.com/page')
        on_exact_domain = registry.get_prompt_description(page)
        assert 'exact_match' in on_exact_domain
        assert 'non_matching' not in on_exact_domain

        # Subdomain: the wildcard pattern applies, the bare domain does not.
        page.url = 'https://sub.example.com/page'
        on_subdomain = registry.get_prompt_description(page)
        assert 'subdomain_match' in on_subdomain
        assert 'exact_match' not in on_subdomain

        # Host that merely starts with "example".
        page.url = 'https://example123.org/page'
        on_prefixed_host = registry.get_prompt_description(page)
        assert 'prefix_match' in on_prefixed_host

    def test_domain_and_page_filter_together(self):
        """An action carrying both filters is described only when both accept the page."""

        def is_admin(page):
            return 'admin' in page.url

        registry = ActionRegistry()
        registry.actions = {
            'domain_only': self._make_action('domain_only', 'Domain filter only', domains=['example.com']),
            'page_only': self._make_action('page_only', 'Page filter only', page_filter=is_admin),
            'both_matching': self._make_action(
                'both_matching', 'Both filters matching', domains=['example.com'], page_filter=is_admin
            ),
            'both_one_fail': self._make_action(
                'both_one_fail', 'One filter fails', domains=['other.com'], page_filter=is_admin
            ),
        }

        # Admin URL on example.com: everything matches except the other.com action.
        page = self._make_page('https://example.com/admin')
        description = registry.get_prompt_description(page)
        assert 'domain_only' in description  # Domain matches
        assert 'page_only' in description  # Page filter matches
        assert 'both_matching' in description  # Both filters match
        assert 'both_one_fail' not in description  # Domain filter fails

        # Same domain, but the URL no longer satisfies the page filter.
        page.url = 'https://example.com/dashboard'
        description = registry.get_prompt_description(page)
        assert 'domain_only' in description  # Domain matches
        assert 'page_only' not in description  # Page filter fails
        assert 'both_matching' not in description  # Page filter fails
        assert 'both_one_fail' not in description  # Domain filter fails

    @pytest.mark.asyncio
    async def test_registry_action_decorator(self):
        """Actions registered through ``Registry.action`` honour their filters in prompts."""
        registry = Registry()

        def on_admin_page(page):
            return 'admin' in page.url

        @registry.action(
            description='No filter action',
        )
        def no_filter_action():
            pass

        @registry.action(description='Domain filter action', domains=['example.com'])
        def domain_filter_action():
            pass

        @registry.action(description='Page filter action', page_filter=on_admin_page)
        def page_filter_action():
            pass

        # Without a page, only the unfiltered action is described.
        system_description = registry.get_prompt_description()
        assert 'No filter action' in system_description
        assert 'Domain filter action' not in system_description
        assert 'Page filter action' not in system_description

        # With a page satisfying both filters, the filtered actions are described.
        admin_page = self._make_page('https://example.com/admin')
        page_description = registry.get_prompt_description(admin_page)
        assert 'Domain filter action' in page_description
        assert 'Page filter action' in page_description

    @pytest.mark.asyncio
    async def test_action_model_creation(self):
        """``create_action_model`` exposes exactly the actions whose filters accept the page."""
        registry = Registry()

        def on_admin_page(page):
            return 'admin' in page.url

        @registry.action(
            description='No filter action',
        )
        def no_filter_action():
            pass

        @registry.action(description='Domain filter action', domains=['example.com'])
        def domain_filter_action():
            pass

        @registry.action(description='Page filter action', page_filter=on_admin_page)
        def page_filter_action():
            pass

        @registry.action(description='Both filters action', domains=['example.com'], page_filter=on_admin_page)
        def both_filters_action():
            pass

        registered = ('no_filter_action', 'domain_filter_action', 'page_filter_action', 'both_filters_action')

        def exposed(model):
            """Subset of the four registered names present as fields on *model*."""
            return {name for name in registered if name in model.model_fields}

        # Initial model (no page): only the unfiltered action.
        assert exposed(registry.create_action_model()) == {'no_filter_action'}

        # Matching domain and matching page filter: all four actions.
        page = self._make_page('https://example.com/admin')
        assert exposed(registry.create_action_model(page=page)) == set(registered)

        # Non-matching domain: the domain-filtered actions drop out.
        page.url = 'https://other.com/admin'
        assert exposed(registry.create_action_model(page=page)) == {'no_filter_action', 'page_filter_action'}

        # Non-matching page filter: the page-filtered actions drop out.
        page.url = 'https://example.com/dashboard'
        assert exposed(registry.create_action_model(page=page)) == {'no_filter_action', 'domain_filter_action'}

View file

@ -0,0 +1,220 @@
import asyncio
import os
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, SecretStr
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.views import BrowserState
@pytest.fixture
def llm():
    """Build the Azure OpenAI 'gpt-4o' chat model used by the tests in this file.

    Endpoint and key come from AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY,
    each falling back to an empty string when unset.
    """
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    # Alternatives previously tried here:
    #   ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    #   ChatOpenAI(model='gpt-4o-mini')
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=endpoint,
        api_key=key,
    )
@pytest.fixture(scope='session')
def event_loop():
    """Provide one new event loop for the whole test session and close it at the end."""
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Session-wide headless Browser; closed when the session ends."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a fresh browser context for one test."""
    fresh_context = await browser.new_context()
    # Leaving the ``async with`` block runs the context's __aexit__ as cleanup.
    async with fresh_context as ctx:
        yield ctx
# pytest tests/test_agent_actions.py -v -k "test_ecommerce_interaction" --capture=no
# @pytest.mark.asyncio
@pytest.mark.skip(reason='Kinda expensive to run')
async def test_ecommerce_interaction(llm, context):
    """Run an Amazon search task and check the kinds of actions the agent took.

    The agent's actions are reduced to coarse labels (navigate / input / click,
    plus a verdict on the typed search text), and the test asserts that the
    essential labels are present.
    """
    agent = Agent(
        task="Go to amazon.com, search for 'laptop', filter by 4+ stars, and find the price of the first result",
        llm=llm,
        browser_context=context,
        save_conversation_path='tmp/test_ecommerce_interaction/conversation',
    )
    history: AgentHistoryList = await agent.run(max_steps=20)

    action_sequence = []
    for action in history.model_actions():
        # The first key of each action dict is treated as the action's name.
        action_name = list(action)[0]
        if action_name in ('go_to_url', 'open_tab'):
            action_sequence.append('navigate')
        elif action_name == 'input_text':
            action_sequence.append('input')
            # Grade what was typed against the expected search term 'laptop'.
            typed = action['input_text']['text'].lower()  # type: ignore
            if typed == 'laptop':
                verdict = 'input_exact_correct'
            elif 'laptop' in typed:
                verdict = 'correct_in_input'
            else:
                verdict = 'incorrect_input'
            action_sequence.append(verdict)
        elif action_name == 'click_element':
            action_sequence.append('click')

    # Essential steps: navigation, text entry, a click, and a search term containing 'laptop'.
    assert 'navigate' in action_sequence  # Navigated to Amazon
    assert 'input' in action_sequence  # Entered search term
    assert 'click' in action_sequence  # Clicked search/filter
    assert 'input_exact_correct' in action_sequence or 'correct_in_input' in action_sequence
# @pytest.mark.asyncio
async def test_error_recovery(llm, context):
    """Ask the agent to visit a non-existent site and then recover via google.com.

    NOTE(review): only the first ``go_to_url`` action is inspected, and it is
    required to end with 'google.com' - confirm that this is the intended check.
    """
    agent = Agent(
        task='Navigate to nonexistent-site.com and then recover by going to google.com ',
        llm=llm,
        browser_context=context,
    )
    history: AgentHistoryList = await agent.run(max_steps=10)

    actions_names = history.action_names()
    navigated = 'go_to_url' in actions_names or 'open_tab' in actions_names
    assert navigated, f'{actions_names} does not contain go_to_url or open_tab'

    # Pick the first go_to_url action, if there is one, and validate its URL.
    first_go_to_url = next((action for action in history.model_actions() if 'go_to_url' in action), None)
    if first_go_to_url is not None:
        assert 'url' in first_go_to_url['go_to_url'], 'url is not in go_to_url'
        assert first_go_to_url['go_to_url']['url'].endswith('google.com'), 'url does not end with google.com'
# @pytest.mark.asyncio
async def test_find_contact_email(llm, context):
    """The agent should extract the contact e-mail address from browser-use.com."""
    agent = Agent(
        task='Go to https://browser-use.com/ and find out the contact email',
        llm=llm,
        browser_context=context,
    )
    history: AgentHistoryList = await agent.run(max_steps=10)

    email = 'info@browser-use.com'
    extracted_content = history.extracted_content()
    # Pass as soon as any extracted chunk mentions the address.
    if not any(email in content for content in extracted_content):
        pytest.fail(f'{extracted_content} does not contain {email}')
# @pytest.mark.asyncio
async def test_agent_finds_installation_command(llm, context):
    """The agent should find the pip install command for browser-use on the web."""
    agent = Agent(
        task='Find the pip installation command for the browser-use repo',
        llm=llm,
        browser_context=context,
    )
    history: AgentHistoryList = await agent.run(max_steps=10)

    install_command = 'pip install browser-use'
    extracted_content = history.extracted_content()
    # Pass as soon as any extracted chunk contains the command.
    if not any(install_command in content for content in extracted_content):
        pytest.fail(f'{extracted_content} does not contain {install_command}')
# One captcha scenario used to parametrize test_captcha_solver.
class CaptchaTest(BaseModel):
    name: str  # label used in the failure message
    url: str  # page the agent is told to open
    success_text: str  # text searched for in the page text to decide success
    additional_text: str | None = None  # optional extra instruction appended to the task
# run 3 test: python -m pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO
# pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO
@pytest.mark.asyncio
@pytest.mark.parametrize(
    'captcha',
    [
        CaptchaTest(
            name='Text Captcha',
            url='https://2captcha.com/demo/text',
            success_text='Captcha is passed successfully!',
        ),
        CaptchaTest(
            name='Basic Captcha',
            url='https://captcha.com/demos/features/captcha-demo.aspx',
            success_text='Correct!',
        ),
        CaptchaTest(
            name='Rotate Captcha',
            url='https://2captcha.com/demo/rotatecaptcha',
            success_text='Captcha is passed successfully',
            additional_text='Use multiple clicks at once. click done when image is exact correct position.',
        ),
        CaptchaTest(
            name='MT Captcha',
            url='https://2captcha.com/demo/mtcaptcha',
            success_text='Verified Successfully',
            additional_text='Stop when you solved it successfully.',
        ),
    ],
)
async def test_captcha_solver(llm, context, captcha: CaptchaTest):
    """Test agent's ability to solve different types of captchas.

    The agent gets up to 7 steps on the captcha page; afterwards the text of the
    current browser state is searched for ``captcha.success_text``.

    Args:
        llm: language model fixture handed to the Agent.
        context: browser context fixture the agent operates in.
        captcha: the scenario (URL, success text, optional extra instruction).
    """
    task = f'Go to {captcha.url} and solve the captcha.'
    # Append the extra instruction only when one is given; formatting the
    # optional field directly would put the literal text "None" into the task.
    if captcha.additional_text:
        task = f'{task} {captcha.additional_text}'

    agent = Agent(
        task=task,
        llm=llm,
        browser_context=context,
    )
    # The returned history is not needed; success is judged from the page state.
    await agent.run(max_steps=7)

    state: BrowserState = await context.get_state()
    raw_text = state.element_tree.get_all_text_till_next_clickable_element()
    # Normalise to a string: falsy values become '', anything else is stringified.
    all_text = str(raw_text or '')

    solved = captcha.success_text in all_text
    assert solved, f'Failed to solve {captcha.name}'
# python -m pytest tests/test_agent_actions.py -v --capture=no
# pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO

View file

@ -0,0 +1,69 @@
import asyncio
from playwright.async_api import async_playwright
async def test_full_screen(start_fullscreen: bool, maximize: bool):
    """Manual check: attach to an already running Chrome over CDP and open Gmail.

    Chrome must be listening on port 9222 beforehand, e.g.:
    /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --no-first-run

    Any error is printed with its traceback instead of being raised, and the
    CDP connection is closed at the end if it was established.

    Args:
        start_fullscreen: accepted but not used by this function.
        maximize: accepted but not used by this function.
    """
    import traceback

    async with async_playwright() as p:
        # Stays None until the connection succeeds, so the cleanup below never
        # touches an unbound name when connecting fails.
        browser = None
        try:
            print('Attempting to connect to Chrome...')
            browser = await p.chromium.connect_over_cdp(
                'http://localhost:9222',
                timeout=20000,  # 20 second timeout for connection
            )
            print('Connected to Chrome successfully')

            # Get the first context and page, or create new ones if needed
            if browser.contexts:
                context = browser.contexts[0]
            else:
                context = await browser.new_context(ignore_https_errors=True)
            if context.pages:
                page = context.pages[0]
            else:
                page = await context.new_page()

            print('Attempting to navigate to Gmail...')
            try:
                # First try with a shorter timeout
                await page.goto(
                    'https://mail.google.com',
                    wait_until='load',  # Changed from domcontentloaded
                    timeout=10000,
                )
            except Exception as e:
                print(f'First navigation attempt failed: {e}')
                print('Trying again with different settings...')
                # If that fails, try again with different settings
                await page.goto(
                    'https://mail.google.com',
                    wait_until='commit',  # Less strict wait condition
                    timeout=30000,
                )

            # Wait for the page to stabilize
            await asyncio.sleep(2)
            print(f'Current page title: {await page.title()}')

            # Optional: wait for specific Gmail elements
            try:
                await page.wait_for_selector('div[role="main"]', timeout=5000)
                print('Gmail interface detected')
            except Exception as e:
                print(f'Note: Gmail interface not detected: {e}')

            # Pause for 30 seconds before the connection is closed.
            await asyncio.sleep(30)
        except Exception as e:
            # Deliberately best-effort: report the problem and fall through to cleanup.
            print(f'An error occurred: {e}')
            traceback.print_exc()
        finally:
            if browser is not None:
                await browser.close()
# Allow running this file directly as a script; both flags are passed as False.
if __name__ == '__main__':
    asyncio.run(test_full_screen(False, False))

View file

@ -0,0 +1,578 @@
import asyncio
import subprocess
import psutil
import pytest
import requests
from browser_use.browser.browser import Browser, BrowserConfig, ProxySettings
from browser_use.browser.context import BrowserContext, BrowserContextConfig
@pytest.mark.asyncio
async def test_builtin_browser_launch(monkeypatch):
    """
    Standard launch path: with no remote (cdp or wss) endpoint and no chrome binary
    configured, get_playwright_browser() should return what chromium.launch() returns.

    async_playwright is replaced by stand-in objects, so no real browser is started.
    """

    class FakeBrowser:
        pass

    class FakeChromium:
        async def launch(self, headless, args, proxy=None, handle_sigterm=False, handle_sigint=False):
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(headless=True, disable_security=False, extra_browser_args=['--test']))
    launched = await browser_obj.get_playwright_browser()
    assert isinstance(launched, FakeBrowser), 'Expected DummyBrowser from _setup_builtin_browser'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_cdp_browser_launch(monkeypatch):
    """
    CDP path: when ``cdp_url`` is configured, the browser must be obtained through
    chromium.connect_over_cdp() called with exactly that URL.
    """
    expected_endpoint = 'ws://dummy-cdp-url'

    class FakeBrowser:
        pass

    class FakeChromium:
        async def connect_over_cdp(self, endpoint_url, timeout=20000):
            assert endpoint_url == expected_endpoint, 'The endpoint URL should match the configuration.'
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(cdp_url='ws://dummy-cdp-url'))
    connected = await browser_obj.get_playwright_browser()
    assert isinstance(connected, FakeBrowser), 'Expected DummyBrowser from _setup_cdp'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_wss_browser_launch(monkeypatch):
    """
    WSS path: when ``wss_url`` is configured, the browser must be obtained through
    chromium.connect() called with exactly that URL.
    """
    expected_endpoint = 'ws://dummy-wss-url'

    class FakeBrowser:
        pass

    class FakeChromium:
        async def connect(self, wss_url):
            assert wss_url == expected_endpoint, 'WSS URL should match the configuration.'
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(wss_url='ws://dummy-wss-url'))
    connected = await browser_obj.get_playwright_browser()
    assert isinstance(connected, FakeBrowser), 'Expected DummyBrowser from _setup_wss'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_user_provided_browser_launch(monkeypatch):
    """
    User-provided binary path: with ``browser_binary_path`` set and a Chrome debugging
    endpoint already answering on port 9222, the existing instance is reused through
    connect_over_cdp('http://localhost:9222').
    """

    class OkResponse:
        status_code = 200

    def fake_get(url, timeout):
        # Only the version endpoint on the default debugging port answers.
        if url != 'http://localhost:9222/json/version':
            raise requests.ConnectionError('Connection failed')
        return OkResponse()

    class FakeBrowser:
        pass

    class FakeChromium:
        async def connect_over_cdp(self, endpoint_url, timeout=20000):
            assert endpoint_url == 'http://localhost:9222', "Endpoint URL must be 'http://localhost:9222'"
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    monkeypatch.setattr(requests, 'get', fake_get)
    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(browser_binary_path='dummy/chrome', extra_browser_args=['--dummy-arg']))
    connected = await browser_obj.get_playwright_browser()
    assert isinstance(connected, FakeBrowser), 'Expected DummyBrowser from _setup_user_provided_browser'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_user_provided_browser_launch_on_custom_chrome_remote_debugging_port(monkeypatch):
    """
    User-provided binary with a custom ``chrome_remote_debugging_port``: the port must
    be used for the --remote-debugging-port launch argument and for the CDP endpoint
    that connect_over_cdp() receives.
    """
    port = 9223
    port_flag = f'--remote-debugging-port={port}'
    cdp_endpoint = f'http://localhost:{port}'

    class OkResponse:
        status_code = 200

    def fake_get(url, timeout):
        # Only the version endpoint on the custom port answers.
        if url != f'http://localhost:{port}/json/version':
            raise requests.ConnectionError('Connection failed')
        return OkResponse()

    class FakePsutilProcess:
        def __init__(self, *args, **kwargs):
            pass

    class FakeSubprocess:
        pid = 1234

    async def fake_create_subprocess_exec(browser_binary_path, *args, **kwargs):
        assert port_flag in args, f'Chrome must be started with with --remote-debugging-port={port} argument'
        return FakeSubprocess()

    class FakeBrowser:
        pass

    class FakeChromium:
        async def connect_over_cdp(self, endpoint_url, timeout=20000):
            assert endpoint_url == cdp_endpoint, f"Endpoint URL must be 'http://localhost:{port}'"
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    monkeypatch.setattr(requests, 'get', fake_get)
    monkeypatch.setattr(asyncio, 'create_subprocess_exec', fake_create_subprocess_exec)
    monkeypatch.setattr(psutil, 'Process', FakePsutilProcess)
    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(
        config=BrowserConfig(
            browser_binary_path='dummy/chrome',
            chrome_remote_debugging_port=port,
            extra_browser_args=['--dummy-arg'],
        )
    )
    connected = await browser_obj.get_playwright_browser()
    assert isinstance(connected, FakeBrowser), (
        f'Expected DummyBrowser with remote debugging port {port} from _setup_user_provided_browser'
    )
    await browser_obj.close()
@pytest.mark.asyncio
async def test_builtin_browser_disable_security_args(monkeypatch):
    """
    With ``disable_security=True`` the standard launch must pass, in order: the base
    arguments, the security-disabling arguments, and then any extra arguments.
    """
    extra_args = ['--dummy-extra']
    # Base arguments, then the ones expected for disable_security, then the extras.
    expected_args = [
        '--no-sandbox',
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
        '--disable-background-timer-throttling',
        '--disable-popup-blocking',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-window-activation',
        '--disable-focus-on-load',
        '--no-first-run',
        '--no-default-browser-check',
        '--no-startup-window',
        '--window-position=0,0',
        '--disable-web-security',
        '--disable-site-isolation-trials',
        '--disable-features=IsolateOrigins,site-per-process',
    ] + extra_args

    class FakeBrowser:
        pass

    class FakeChromium:
        async def launch(self, headless, args, proxy=None, handle_sigterm=False, handle_sigint=False):
            assert headless is True, 'Expected headless to be True'
            assert args == expected_args, f'Expected args {expected_args}, but got {args}'
            assert proxy is None, 'Expected proxy to be None'
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(headless=True, disable_security=True, extra_browser_args=extra_args))
    launched = await browser_obj.get_playwright_browser()
    assert isinstance(launched, FakeBrowser), (
        'Expected DummyBrowser from _setup_builtin_browser with disable_security active'
    )
    await browser_obj.close()
@pytest.mark.asyncio
async def test_new_context_creation():
    """
    new_context() should return a BrowserContext that references the creating
    Browser and carries the configuration that was passed in.
    """
    browser_obj = Browser(config=BrowserConfig())
    context_config = BrowserContextConfig()

    created = await browser_obj.new_context(context_config)

    assert isinstance(created, BrowserContext), 'Expected new_context to return an instance of BrowserContext'
    assert created.browser is browser_obj, "Expected the context's browser attribute to be the Browser instance"
    assert created.config == context_config, "Expected the context's config attribute to be the provided config"
    await browser_obj.close()
@pytest.mark.asyncio
async def test_user_provided_browser_launch_failure(monkeypatch):
    """
    When no Chrome instance can be reached or connected to, get_playwright_browser()
    is expected to raise a RuntimeError mentioning 'To start chrome in Debug mode'.

    Failure is simulated by:
    - making requests.get always raise ConnectionError (no existing instance found),
    - replacing subprocess.Popen with a no-op,
    - replacing asyncio.sleep with an immediate return to avoid delays,
    - making connect_over_cdp always raise.

    NOTE(review): the custom-port test in this file patches
    asyncio.create_subprocess_exec rather than subprocess.Popen; confirm which one
    the launch path under test actually calls.
    """

    def refuse_connection(url, timeout):
        raise requests.ConnectionError('Simulated connection failure')

    def noop_popen(args, stdout, stderr):
        return None

    async def instant_sleep(seconds):
        return

    class FakeChromium:
        async def connect_over_cdp(self, endpoint_url, timeout=20000):
            raise Exception('Connection failed simulation')

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    monkeypatch.setattr(requests, 'get', refuse_connection)
    monkeypatch.setattr(subprocess, 'Popen', noop_popen)
    monkeypatch.setattr(asyncio, 'sleep', instant_sleep)
    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(browser_binary_path='dummy/chrome', extra_browser_args=['--dummy-arg']))
    with pytest.raises(RuntimeError, match='To start chrome in Debug mode'):
        await browser_obj.get_playwright_browser()
    await browser_obj.close()
@pytest.mark.asyncio
async def test_get_playwright_browser_caching(monkeypatch):
    """
    get_playwright_browser() should initialise the browser on the first call and
    return the very same object on later calls.
    """

    class FakeBrowser:
        pass

    class FakeChromium:
        async def launch(self, headless, args, proxy=None, handle_sigterm=False, handle_sigint=False):
            # A new object per launch, so identity below proves reuse rather than equality.
            return FakeBrowser()

    class FakePlaywright:
        chromium = FakeChromium()

        async def stop(self):
            pass

    class FakePlaywrightStarter:
        async def start(self):
            return FakePlaywright()

    # The class itself is the zero-argument factory standing in for async_playwright().
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', FakePlaywrightStarter)

    browser_obj = Browser(config=BrowserConfig(headless=True, disable_security=False, extra_browser_args=['--test']))
    first = await browser_obj.get_playwright_browser()
    second = await browser_obj.get_playwright_browser()
    assert first is second, 'Expected the browser to be cached and reused across calls.'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_close_error_handling(monkeypatch):
    """
    Check that Browser.close() survives failures in its collaborators.

    Both the stub browser's close() and the stub playwright's stop() raise;
    close() must still return normally and leave playwright_browser and
    playwright set to None.
    """

    class ExplodingBrowser:
        async def close(self):
            raise Exception('Close error simulation')

    class ExplodingPlaywright:
        async def stop(self):
            raise Exception('Stop error simulation')

    browser_obj = Browser(config=BrowserConfig())
    # Inject the failing collaborators directly; no real browser is launched.
    browser_obj.playwright_browser = ExplodingBrowser()
    browser_obj.playwright = ExplodingPlaywright()

    await browser_obj.close()

    assert browser_obj.playwright_browser is None, 'Expected playwright_browser to be None after close'
    assert browser_obj.playwright is None, 'Expected playwright to be None after close'
@pytest.mark.asyncio
async def test_standard_browser_launch_with_proxy(monkeypatch):
    """
    Check that a proxy set on BrowserConfig reaches chromium.launch as a dict.

    The stub launch() asserts that its proxy argument is a dict whose
    'server' entry is 'http://dummy.proxy', then returns a StubBrowser that
    the test expects back from get_playwright_browser.
    """

    class StubBrowser:
        pass

    # Proxy handed to the configuration; also referenced in the failure message below.
    dummy_proxy = ProxySettings(server='http://dummy.proxy')

    class StubChromium:
        async def launch(self, headless, args, proxy=None, handle_sigterm=False, handle_sigint=False):
            # Only the proxy argument is inspected; headless/args are accepted as-is.
            proxy_ok = isinstance(proxy, dict) and proxy['server'] == 'http://dummy.proxy'
            assert proxy_ok, f'Expected proxy {dummy_proxy} but got {proxy}'
            return StubBrowser()

    class StubPlaywright:
        def __init__(self):
            self.chromium = StubChromium()

        async def stop(self):
            pass

    class StubPlaywrightFactory:
        async def start(self):
            return StubPlaywright()

    monkeypatch.setattr('browser_use.browser.browser.async_playwright', StubPlaywrightFactory)

    browser_obj = Browser(config=BrowserConfig(headless=False, disable_security=False, proxy=dummy_proxy))
    launched = await browser_obj.get_playwright_browser()
    assert isinstance(launched, StubBrowser), 'Expected DummyBrowser from _setup_builtin_browser with proxy provided'
    await browser_obj.close()
@pytest.mark.asyncio
async def test_browser_window_size(monkeypatch):
    """
    Check that window_width/window_height reach new_context as a record_video_size dict.

    A BrowserContext is built with BrowserContextConfig(window_width=1280,
    window_height=1100) on top of fully stubbed Playwright objects. The stub
    browser's new_context() asserts that the 'record_video_size' keyword is a
    dict with width 1280 and height 1100.

    NOTE(review): the assertions only run if _initialize_session() calls
    new_context() on the stub browser - presumably it does; confirm against
    BrowserContext.
    """

    class DummyPage:
        # Minimal page stand-in; every method is a no-op or returns a fixed value.
        def __init__(self):
            self.url = 'about:blank'

        async def goto(self, url):
            pass

        async def wait_for_load_state(self, state):
            pass

        async def title(self):
            return 'Test Page'

        async def bring_to_front(self):
            pass

        async def evaluate(self, script):
            return True

        def is_closed(self):
            return False

    class DummyContext:
        # Stand-in for a Playwright browser context with one pre-existing page.
        def __init__(self):
            self.pages = [DummyPage()]
            # The context doubles as its own tracing object, so start()/stop() below serve both.
            self.tracing = self

        async def new_page(self):
            return DummyPage()

        async def add_init_script(self, script):
            pass

        async def start(self):
            pass

        async def stop(self, path=None):
            pass

        def on(self, event, handler):
            pass

        async def close(self):
            pass

        async def grant_permissions(self, permissions, origin=None):
            pass

    class DummyBrowser:
        def __init__(self):
            self.contexts = []

        async def new_context(self, **kwargs):
            # Assert that record_video_size is a dictionary with expected values
            assert isinstance(kwargs['record_video_size'], dict), (
                f'Expected record_video_size to be a dictionary, got {type(kwargs["record_video_size"])}'
            )
            assert kwargs['record_video_size']['width'] == 1280, (
                f'Expected width to be 1280, got {kwargs["record_video_size"].get("width")}'
            )
            assert kwargs['record_video_size']['height'] == 1100, (
                f'Expected height to be 1100, got {kwargs["record_video_size"].get("height")}'
            )
            context = DummyContext()
            self.contexts.append(context)
            return context

        async def close(self):
            pass

    class DummyPlaywright:
        def __init__(self):
            # The playwright stub acts as its own chromium object, so launch() lives here.
            self.chromium = self

        async def launch(self, **kwargs):
            return DummyBrowser()

        async def stop(self):
            pass

    class DummyAsyncPlaywrightContext:
        async def start(self):
            return DummyPlaywright()

    # Monkeypatch async_playwright to return our dummy async playwright context
    monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext())
    # Create browser with default config
    browser_obj = Browser()
    # Get browser instance (the returned value itself is not used afterwards)
    playwright_browser = await browser_obj.get_playwright_browser()
    # Create context config with specific window size
    context_config = BrowserContextConfig(window_width=1280, window_height=1100)
    # Create browser context - this will test if window dimensions are properly converted
    browser_context = BrowserContext(browser=browser_obj, config=context_config)
    await browser_context._initialize_session()
    # Clean up
    await browser_context.close()
    await browser_obj.close()

View file

@ -0,0 +1,201 @@
import os
import pytest
from browser_use.browser.browser import Browser, BrowserConfig, ProxySettings
from browser_use.browser.context import BrowserContext, BrowserContextConfig
@pytest.mark.asyncio
async def test_proxy_settings_pydantic_model():
    """
    Check ProxySettings' dict-style access and its model_dump() output.

    No browser is launched; only the model object itself is inspected.
    """
    fields = {
        'server': 'http://example.proxy:8080',
        'bypass': 'localhost',
        'username': 'testuser',
        'password': 'testpass',
    }
    proxy_settings = ProxySettings(**fields)

    # Item access and .get() (with and without a fallback) on the model itself.
    assert proxy_settings['server'] == 'http://example.proxy:8080'
    assert proxy_settings.get('bypass') == 'localhost'
    assert proxy_settings.get('nonexistent', 'default') == 'default'

    # model_dump() must yield a plain dict carrying every configured field.
    dumped = proxy_settings.model_dump()
    assert isinstance(dumped, dict)
    for key, value in fields.items():
        assert dumped[key] == value
@pytest.mark.asyncio
async def test_window_size_config():
    """
    Check that BrowserContextConfig stores window_width and window_height
    and exposes them through model_dump().
    """

    def assert_size(cfg, width, height):
        # Attribute-level check shared by both configurations below.
        assert cfg.window_width == width
        assert cfg.window_height == height

    primary = BrowserContextConfig(window_width=1280, window_height=1100)
    assert_size(primary, 1280, 1100)

    dumped = primary.model_dump()
    assert isinstance(dumped, dict)
    assert dumped['window_width'] == 1280
    assert dumped['window_height'] == 1100

    # A second instance with different values must not be affected by the first.
    secondary = BrowserContextConfig(window_width=1920, window_height=1080)
    assert_size(secondary, 1920, 1080)
@pytest.mark.asyncio
@pytest.mark.skipif(os.environ.get('CI') == 'true', reason='Skip browser test in CI')
async def test_window_size_with_real_browser():
    """
    Integration test: start a headless Browser, open a BrowserContext configured
    with window_width=1024 / window_height=768, and check that the page reports
    a positive inner width and height.

    Skipped when the CI environment variable equals 'true'.

    NOTE(review): the configured 1024x768 is never compared against the
    measured size - only positivity is asserted - and `video_size` is
    computed but never used.
    """
    # Create browser config with headless mode
    browser_config = BrowserConfig(
        headless=True,  # Use headless for faster test
    )
    # Create context config with specific dimensions we can check
    context_config = BrowserContextConfig(
        window_width=1024,
        window_height=768,
        maximum_wait_page_load_time=2.0,  # Faster timeouts for test
        minimum_wait_page_load_time=0.2,
        no_viewport=True,  # Use actual window size instead of viewport
    )
    # Create browser and context
    browser = Browser(config=browser_config)
    try:
        # Initialize browser
        playwright_browser = await browser.get_playwright_browser()
        assert playwright_browser is not None, 'Browser initialization failed'
        # Create context
        browser_context = BrowserContext(browser=browser, config=context_config)
        try:
            # Initialize session
            await browser_context._initialize_session()
            # Get the current page
            page = await browser_context.get_current_page()
            assert page is not None, 'Failed to get current page'
            # Get the context configuration used for browser window size.
            # The script falls back to window.innerWidth/innerHeight when
            # window.getPlaywrightContextSettings is absent or yields no recordVideo.
            video_size = await page.evaluate("""
() => {
// This returns information about the context recording settings
// which should match our configured video size (browser_window_size)
try {
const settings = window.getPlaywrightContextSettings ?
window.getPlaywrightContextSettings() : null;
if (settings && settings.recordVideo) {
return settings.recordVideo.size;
}
} catch (e) {}
// Fallback to window dimensions
return {
width: window.innerWidth,
height: window.innerHeight
};
}
""")
            # Let's also check the viewport size
            viewport_size = await page.evaluate("""
() => {
return {
width: window.innerWidth,
height: window.innerHeight
}
}
""")
            print(f'Window size config: width={context_config.window_width}, height={context_config.window_height}')
            print(f'Browser viewport size: {viewport_size}')
            # This is a lightweight test to verify that the page has a size (details may vary by browser)
            assert viewport_size['width'] > 0, 'Expected viewport width to be positive'
            assert viewport_size['height'] > 0, 'Expected viewport height to be positive'
            # For browser context creation in record_video_size, this is what truly matters
            # Verify that our window size was properly serialized to a dictionary
            # (only printed here for manual inspection; nothing is asserted about it)
            print(f'Content of context session: {browser_context.session.context}')
            print('✅ Browser window size used in the test')
        finally:
            # Clean up context
            await browser_context.close()
    finally:
        # Clean up browser
        await browser.close()
@pytest.mark.asyncio
async def test_proxy_with_real_browser():
    """
    Integration test: a ProxySettings model can be handed to BrowserConfig and
    the browser can be initialized with it, without a working proxy server.

    Steps:
    1. Build a ProxySettings model pointing at a non-existent proxy.
    2. Check that model_dump() yields a dict with the configured server.
    3. Initialize a headless Browser with that proxy.
    4. If initialization raises, accept the error only when it either does not
       mention 'proxy' or also contains a network-related term.

    Proxy functionality itself is not exercised (that would need a real proxy).
    """
    # Create proxy settings with a fake proxy server
    proxy_settings = ProxySettings(
        server='http://non.existent.proxy:9999', bypass='localhost', username='testuser', password='testpass'
    )
    # Test model serialization
    proxy_dict = proxy_settings.model_dump()
    assert isinstance(proxy_dict, dict)
    assert proxy_dict['server'] == 'http://non.existent.proxy:9999'
    # Create browser config with proxy
    browser_config = BrowserConfig(
        headless=True,
        proxy=proxy_settings,
    )
    # Create browser
    browser = Browser(config=browser_config)
    try:
        try:
            # Only the launch itself is guarded: this is the call that may
            # legitimately fail with a network-style error.
            playwright_browser = await browser.get_playwright_browser()
        except Exception as e:
            # Make sure any exception isn't related to the proxy configuration format
            # (Network errors due to non-existent proxy are acceptable, invalid type conversion isn't)
            error_text = str(e).lower()
            assert 'proxy' not in error_text or any(
                term in error_text for term in ['connect', 'connection', 'network', 'timeout', 'unreachable']
            ), f'Proxy configuration error (not network error): {e}'
        else:
            # Success-path checks live in `else` so that a failing assertion is
            # not caught by the `except Exception` above and silently accepted
            # (its message does not contain 'proxy').
            assert playwright_browser is not None, 'Browser initialization failed'
            # We won't try to make requests (which would fail with non-existent proxy)
            print('✅ Browser initialized with proxy settings successfully')
    finally:
        # Clean up browser
        await browser.close()

View file

@ -0,0 +1,107 @@
"""
Example script demonstrating the browser_window_size feature.
This script shows how to set a custom window size for the browser.
"""
import asyncio
import sys
from typing import Any
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig
async def main() -> int:
    """Demonstrate setting a custom browser window size.

    Opens a non-headless browser with an 800x400 context, navigates to
    https://example.com, prints the configured size next to the viewport size
    measured in the page, and runs validate_window_size on the two.

    Returns:
        0 when the demonstration runs to completion; 1 when creating the
        browser or the context fails, or when any other step raises.
        A navigation failure alone is reported and does not abort the run.
    """
    # Create a browser with a specific window size
    config = BrowserContextConfig(window_width=800, window_height=400)  # Small size to clearly demonstrate the fix
    # Pre-declared so the finally block can tell which resources were actually created.
    browser = None
    browser_context = None
    try:
        # Initialize the browser with error handling
        try:
            browser = Browser(
                config=BrowserConfig(
                    headless=False,  # Use non-headless mode to see the window
                )
            )
        except Exception as e:
            print(f'Failed to initialize browser: {e}')
            return 1
        # Create a browser context
        try:
            browser_context = await browser.new_context(config=config)
        except Exception as e:
            print(f'Failed to create browser context: {e}')
            return 1
        # Get the current page
        page = await browser_context.get_current_page()
        # Navigate to a test page with error handling
        try:
            await page.goto('https://example.com')
            await page.wait_for_load_state('domcontentloaded')
        except Exception as e:
            # Navigation problems are tolerated: the size can still be measured on whatever page is open.
            print(f'Failed to navigate to example.com: {e}')
            print('Continuing with test anyway...')
        # Wait a bit to see the window
        await asyncio.sleep(2)
        # Get the actual viewport size using JavaScript
        viewport_size = await page.evaluate("""
() => {
return {
width: window.innerWidth,
height: window.innerHeight
}
}
""")
        print(f'Configured window size: width={config.window_width}, height={config.window_height}')
        print(f'Actual viewport size: {viewport_size}')
        # Validate the window size
        validate_window_size({'width': config.window_width, 'height': config.window_height}, viewport_size)
        # Wait a bit more to see the window
        await asyncio.sleep(3)
        return 0
    except Exception as e:
        print(f'Unexpected error: {e}')
        return 1
    finally:
        # Close resources (runs on every exit path, including the early returns above)
        if browser_context:
            await browser_context.close()
        if browser:
            await browser.close()
def validate_window_size(configured: dict[str, Any], actual: dict[str, Any]) -> None:
    """Print whether the actual size is within tolerance of the configured size.

    Args:
        configured: Mapping with 'width' and 'height' of the requested size.
        actual: Mapping with 'width' and 'height' of the measured size.

    For each dimension the allowed deviation is the larger of 5% of the
    configured value and 20 (pixels). A warning plus both absolute differences
    is printed when either dimension exceeds its allowance; otherwise a
    success line is printed. Nothing is returned.
    """
    dimensions = ('width', 'height')
    # Absolute deviation per dimension; small deviations are expected
    # (browser chrome, scrollbars, etc.).
    deviation = {dim: abs(configured[dim] - actual[dim]) for dim in dimensions}
    allowance = {dim: max(configured[dim] * 0.05, 20) for dim in dimensions}

    out_of_tolerance = any(deviation[dim] > allowance[dim] for dim in dimensions)
    if not out_of_tolerance:
        print('Window size validation passed: actual size matches configured size within tolerance')
        return

    width_gap = deviation['width']
    height_gap = deviation['height']
    print('WARNING: Significant difference between configured and actual window size!')
    print(f'Width difference: {width_gap}px, Height difference: {height_gap}px')
if __name__ == '__main__':
    # Run the demo and use main()'s integer result as the process exit status.
    sys.exit(asyncio.run(main()))

View file

@ -0,0 +1,33 @@
import asyncio
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig
async def test():
    """Manually exercise browser window sizing with no_viewport=False.

    Opens a non-headless browser with a 1440x900 context, loads
    https://example.com, and prints the viewport size (inner dimensions) and
    the window size (outer dimensions) reported by the page.

    The context and the browser are closed in `finally` blocks so that a
    failure while navigating or evaluating does not leave the visible browser
    window open.
    """
    print('Testing browser window sizing with no_viewport=False...')
    browser = Browser(BrowserConfig(headless=False))
    try:
        context_config = BrowserContextConfig(window_width=1440, window_height=900, no_viewport=False)
        browser_context = await browser.new_context(config=context_config)
        try:
            page = await browser_context.get_current_page()
            await page.goto('https://example.com')
            # Brief pause before measuring.
            await asyncio.sleep(2)
            viewport = await page.evaluate('() => ({width: window.innerWidth, height: window.innerHeight})')
            print('Configured size: width=1440, height=900')
            print(f'Actual viewport size: {viewport}')
            # Get the actual window size
            window_size = await page.evaluate("""
() => ({
width: window.outerWidth,
height: window.outerHeight
})
""")
            print(f'Actual window size: {window_size}')
        finally:
            await browser_context.close()
    finally:
        await browser.close()
# Script entry point: run the manual window-sizing check when executed directly.
if __name__ == '__main__':
    asyncio.run(test())

View file

@ -0,0 +1,363 @@
import base64
from unittest.mock import Mock
import pytest
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.browser.views import BrowserState
from browser_use.dom.views import DOMElementNode
def test_is_url_allowed():
    """
    Check BrowserContext._is_url_allowed against the allowed_domains setting.

    Scenario 1: allowed_domains=None - every URL is accepted.
    Scenario 2: allowed_domains is a list - exact matches, subdomains and
                URLs with a port are accepted; other domains are rejected.
    Scenario 3: a malformed URL is rejected.
    Scenario 4 (glob patterns) is covered in test_url_allowlist_security.py.
    """
    # A Mock stands in for the Browser; it only has to carry a 'config' attribute.
    browser_stub = Mock()
    browser_stub.config = Mock()

    # Scenario 1: no restriction configured.
    open_config = BrowserContextConfig(allowed_domains=None)
    unrestricted = BrowserContext(browser=browser_stub, config=open_config)
    for url in ('http://anydomain.com', 'https://anotherdomain.org/path'):
        assert unrestricted._is_url_allowed(url) is True

    # Scenarios 2 and 3: an explicit allow-list.
    limited_config = BrowserContextConfig(allowed_domains=['example.com', 'mysite.org'])
    restricted = BrowserContext(browser=browser_stub, config=limited_config)
    expectations = [
        ('http://example.com', True),  # exact match
        ('http://sub.example.com/path', True),  # subdomain of an allowed domain
        ('http://notexample.com', False),  # different domain
        ('https://mysite.org/page', True),  # second allowed domain
        ('http://example.com:8080', True),  # port does not affect the decision
        ('notaurl', False),  # malformed URL
    ]
    for url, verdict in expectations:
        assert restricted._is_url_allowed(url) is verdict
def test_convert_simple_xpath_to_css_selector():
    """
    Check BrowserContext._convert_simple_xpath_to_css_selector on simple XPaths.

    Covers the empty string, a path without indices, and paths where one or
    several steps carry a numeric index ([n] becomes :nth-of-type(n)).
    """
    convert = BrowserContext._convert_simple_xpath_to_css_selector
    cases = [
        # (xpath, expected CSS selector)
        ('', ''),
        ('/html/body/div/span', 'html > body > div > span'),
        ('/html/body/div[2]/span', 'html > body > div:nth-of-type(2) > span'),
        ('/ul/li[3]/a[1]', 'ul > li:nth-of-type(3) > a:nth-of-type(1)'),
    ]
    for xpath, css in cases:
        assert convert(xpath) == css
def test_get_initial_state():
    """
    Check the BrowserState produced by BrowserContext._get_initial_state.

    With a page, the state's url equals the page's url and the element tree's
    tag_name is 'root'; without a page, the url is the empty string.
    """
    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())

    class PageStub:
        # Only the 'url' attribute is provided.
        url = 'http://dummy.com'

    page = PageStub()

    populated = context._get_initial_state(page=page)
    assert populated.url == page.url
    assert populated.element_tree.tag_name == 'root'

    blank = context._get_initial_state()
    assert blank.url == ''
@pytest.mark.asyncio
async def test_execute_javascript():
    """
    Check that execute_javascript returns whatever the current page's
    evaluate() returns, using a stub page with a canned result.
    """

    class PageStub:
        async def evaluate(self, script):
            return 'dummy_result'

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())

    # Bare attribute holder acting as the session; only current_page is set.
    session_stub = type('DummySession', (), {})()
    session_stub.current_page = PageStub()
    context.session = session_stub

    outcome = await context.execute_javascript('return 1+1')
    assert outcome == 'dummy_result'
@pytest.mark.asyncio
async def test_enhanced_css_selector_for_element():
    """
    Check the selector built by BrowserContext._enhanced_css_selector_for_element.

    For a div at xpath /html/body/div[2] with class, id, placeholder and
    data-testid attributes (include_dynamic_attributes=True) the expected
    selector is composed of:
    - 'html > body > div:nth-of-type(2)' from the xpath,
    - '.foo.bar' from the class attribute,
    - [id="my-id"],
    - [placeholder*="some \\"quoted\\" text"] (quotes escaped, substring match),
    - [data-testid="123"].
    """
    attributes = {
        'class': 'foo bar',
        'id': 'my-id',
        'placeholder': 'some "quoted" text',
        'data-testid': '123',
    }
    element = DOMElementNode(
        tag_name='div',
        is_visible=True,
        parent=None,
        xpath='/html/body/div[2]',
        attributes=attributes,
        children=[],
    )

    expected_selector = (
        'html > body > div:nth-of-type(2).foo.bar[id="my-id"][placeholder*="some \\"quoted\\" text"][data-testid="123"]'
    )
    actual_selector = BrowserContext._enhanced_css_selector_for_element(element, include_dynamic_attributes=True)
    assert actual_selector == expected_selector, f'Expected {expected_selector}, but got {actual_selector}'
@pytest.mark.asyncio
async def test_get_scroll_info():
    """
    Check get_scroll_info against a stub page with canned evaluate() results.

    The stub answers 100 for scripts mentioning window.scrollY, 500 for
    window.innerHeight and 1200 for document.documentElement.scrollHeight.
    Expected result: 100 pixels above and 1200 - (100 + 500) = 600 below.
    """

    class PageStub:
        async def evaluate(self, script):
            # Checked in this order; the first marker found in the script wins.
            canned = (
                ('window.scrollY', 100),
                ('window.innerHeight', 500),
                ('document.documentElement.scrollHeight', 1200),
            )
            for marker, value in canned:
                if marker in script:
                    return value
            return None

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())

    # Bare attribute holders for the session and its (unused) context.
    session_stub = type('DummySession', (), {})()
    session_stub.current_page = PageStub()
    session_stub.context = type('DummyContext', (), {})()
    context.session = session_stub

    pixels_above, pixels_below = await context.get_scroll_info(session_stub.current_page)
    assert pixels_above == 100, f'Expected 100 pixels above, got {pixels_above}'
    assert pixels_below == 600, f'Expected 600 pixels below, got {pixels_below}'
@pytest.mark.asyncio
async def test_reset_context():
    """
    Check that reset_context closes the existing pages, installs a fresh page
    as current_page, and resets cached_state to an initial BrowserState.
    """

    class PageStub:
        # Records whether close() was called.
        def __init__(self, url='http://dummy.com'):
            self.url = url
            self.closed = False

        async def close(self):
            self.closed = True

        async def wait_for_load_state(self):
            pass

    class ContextStub:
        # Holds pages and appends a blank-URL page on new_page().
        def __init__(self):
            self.pages = []

        async def new_page(self):
            created = PageStub(url='')
            self.pages.append(created)
            return created

    first_page = PageStub(url='http://page1.com')
    second_page = PageStub(url='http://page2.com')
    context_stub = ContextStub()
    context_stub.pages.extend([first_page, second_page])

    session_stub = type('DummySession', (), {})()
    session_stub.context = context_stub
    session_stub.current_page = first_page
    session_stub.cached_state = None

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())
    context.session = session_stub

    # Precondition: two open pages.
    assert len(session_stub.context.pages) == 2

    await context.reset_context()

    # Both original pages were closed.
    assert first_page.closed is True
    assert second_page.closed is True

    # A page is current and has the blank URL given by ContextStub.new_page().
    assert session_stub.current_page is not None
    replacement = session_stub.current_page
    assert replacement.url == ''

    # cached_state is an initial BrowserState.
    cached = session_stub.cached_state
    assert isinstance(cached, BrowserState)
    assert cached.url == ''
    assert cached.element_tree.tag_name == 'root'
@pytest.mark.asyncio
async def test_take_screenshot():
    """
    Check that take_screenshot(full_page=True) returns the base64 text of the
    bytes produced by the page's screenshot(), and that the page receives
    full_page=True and animations='disabled'.
    """
    raw_image = b'test'

    class PageStub:
        async def screenshot(self, full_page, animations):
            # The arguments must arrive exactly as expected.
            assert full_page is True, 'full_page parameter was not correctly passed'
            assert animations == 'disabled', 'animations parameter was not correctly passed'
            return raw_image

    session_stub = type('DummySession', (), {})()
    session_stub.current_page = PageStub()
    session_stub.context = None  # not needed here

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())
    context.session = session_stub

    expected = base64.b64encode(raw_image).decode('utf-8')
    result = await context.take_screenshot(full_page=True)
    assert result == expected, f'Expected {expected}, but got {result}'
@pytest.mark.asyncio
async def test_refresh_page_behavior():
    """
    Check that refresh_page calls both reload() and wait_for_load_state() on
    the current page, using a stub page that records each call.
    """

    class PageStub:
        def __init__(self):
            self.reload_called = False
            self.wait_for_load_state_called = False

        async def reload(self):
            self.reload_called = True

        async def wait_for_load_state(self):
            self.wait_for_load_state_called = True

    page = PageStub()
    session_stub = type('DummySession', (), {})()
    session_stub.current_page = page
    session_stub.context = None  # not needed here

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())
    context.session = session_stub

    await context.refresh_page()

    assert page.reload_called is True, 'Expected the page to call reload()'
    assert page.wait_for_load_state_called is True, 'Expected the page to call wait_for_load_state()'
@pytest.mark.asyncio
async def test_remove_highlights_failure():
    """
    Check that remove_highlights does not propagate an exception raised by
    the current page's evaluate().
    """

    class PageStub:
        # evaluate() fails unconditionally.
        async def evaluate(self, script):
            raise Exception('dummy error')

    session_stub = type('DummySession', (), {})()
    session_stub.current_page = PageStub()
    session_stub.context = None  # not needed here

    browser_stub = Mock()
    browser_stub.config = Mock()
    context = BrowserContext(browser=browser_stub, config=BrowserContextConfig())
    context.session = session_stub

    # Any exception escaping remove_highlights is turned into an explicit test failure.
    try:
        await context.remove_highlights()
    except Exception as e:
        pytest.fail(f'remove_highlights raised an exception: {e}')

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,202 @@
import asyncio
import os
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
@pytest.fixture(scope='function')
def event_loop():
    """Provide a new event loop from the current policy for each test and close it afterwards."""
    policy = asyncio.get_event_loop_policy()
    fresh_loop = policy.new_event_loop()
    yield fresh_loop
    fresh_loop.close()
@pytest.fixture(scope='function')
async def browser(event_loop):
    """Provide a headless Browser for one test and close it on teardown."""
    headless_config = BrowserConfig(headless=True)
    instance = Browser(config=headless_config)
    yield instance
    await instance.close()
@pytest.fixture
async def context(browser):
    """Provide a browser context from the `browser` fixture, kept open via `async with` for the test's duration."""
    new_context = await browser.new_context()
    async with new_context as active_context:
        yield active_context
@pytest.fixture
def llm():
    """Build the AzureChatOpenAI model used by the agent tests.

    Endpoint and key are read from AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY,
    each defaulting to an empty string when unset.
    """
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    secret_key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=endpoint,
        api_key=secret_key,
    )
# pytest -s -k test_search_google
@pytest.mark.asyncio
async def test_search_google(llm, context):
    """The agent should use the 'search_google' action when asked to search Google."""
    search_agent = Agent(task="Search Google for 'OpenAI'.", llm=llm, browser_context=context)
    history: AgentHistoryList = await search_agent.run(max_steps=2)
    performed = history.action_names()
    assert 'search_google' in performed
@pytest.mark.asyncio
async def test_go_to_url(llm, context):
    """The agent should use the 'go_to_url' action when asked to navigate to a URL."""
    nav_agent = Agent(task="Navigate to 'https://www.python.org'.", llm=llm, browser_context=context)
    run_history = await nav_agent.run(max_steps=2)
    performed = run_history.action_names()
    assert 'go_to_url' in performed
@pytest.mark.asyncio
async def test_go_back(llm, context):
    """The agent should navigate to a page and then use the 'go_back' action."""
    back_agent = Agent(
        task="Go to 'https://www.example.com', then go back.",
        llm=llm,
        browser_context=context,
    )
    run_history = await back_agent.run(max_steps=3)
    performed = run_history.action_names()
    # Both the navigation and the back step must appear in the history.
    assert 'go_to_url' in performed
    assert 'go_back' in performed
@pytest.mark.asyncio
async def test_click_element(llm, context):
    """The agent should open a page and click an element by index."""
    click_agent = Agent(
        task="Go to 'https://www.python.org' and click on the first link.",
        llm=llm,
        browser_context=context,
    )
    run_history = await click_agent.run(max_steps=4)
    performed = run_history.action_names()
    # Either navigation style is accepted before the click.
    navigated = 'go_to_url' in performed or 'open_tab' in performed
    assert navigated
    assert 'click_element_by_index' in performed
@pytest.mark.asyncio
async def test_input_text(llm, context):
    """The agent should navigate and then use the 'input_text' action."""
    typing_agent = Agent(
        task="Go to 'https://www.google.com' and input 'OpenAI' into the search box.",
        llm=llm,
        browser_context=context,
    )
    run_history = await typing_agent.run(max_steps=4)
    performed = run_history.action_names()
    assert 'go_to_url' in performed
    assert 'input_text' in performed
@pytest.mark.asyncio
async def test_switch_tab(llm, context):
    """The agent should open at least two tabs and then use 'switch_tab'."""
    tab_agent = Agent(
        task="Open new tabs with 'https://www.google.com' and 'https://www.wikipedia.org', then switch to the first tab.",
        llm=llm,
        browser_context=context,
    )
    run_history = await tab_agent.run(max_steps=6)
    performed = run_history.action_names()
    # Two URLs were requested, so 'open_tab' must occur at least twice.
    tabs_opened = performed.count('open_tab')
    assert tabs_opened >= 2
    assert 'switch_tab' in performed
@pytest.mark.asyncio
async def test_open_new_tab(llm, context):
    """The agent should use the 'open_tab' action when asked to open a new tab."""
    tab_agent = Agent(
        task="Open a new tab and go to 'https://www.example.com'.",
        llm=llm,
        browser_context=context,
    )
    run_history = await tab_agent.run(max_steps=3)
    performed = run_history.action_names()
    assert 'open_tab' in performed
@pytest.mark.asyncio
async def test_extract_page_content(llm, context):
    """The agent should navigate and then use the 'extract_content' action."""
    extract_agent = Agent(
        task="Go to 'https://www.example.com' and extract the page content.",
        llm=llm,
        browser_context=context,
    )
    run_history = await extract_agent.run(max_steps=3)
    performed = run_history.action_names()
    assert 'go_to_url' in performed
    assert 'extract_content' in performed
# pytest -k test_done_action
@pytest.mark.asyncio
async def test_done_action(llm, context):
    """The agent should navigate and then signal completion with the 'done' action."""
    finishing_agent = Agent(
        task="Navigate to 'https://www.example.com' and signal that the task is done.",
        llm=llm,
        browser_context=context,
    )
    run_history = await finishing_agent.run(max_steps=3)
    performed = run_history.action_names()
    assert 'go_to_url' in performed
    assert 'done' in performed
# run with: pytest -k test_scroll_down
@pytest.mark.asyncio
async def test_scroll_down(llm, context):
    """Check that 'scroll_down' is executed and that window.scrollY actually increases."""
    scroll_agent = Agent(
        task="Go to 'https://en.wikipedia.org/wiki/Internet' and scroll down the page.",
        llm=llm,
        browser_context=context,
    )

    # The page handle is taken before the agent runs and reused for both readings.
    page = await context.get_current_page()

    # First run: one step, then record the starting scroll offset.
    await scroll_agent.run(max_steps=1)
    scroll_before = await page.evaluate('window.scrollY;')

    # Second run: up to two more steps, then record the offset again.
    await scroll_agent.run(max_steps=2)
    scroll_after = await page.evaluate('window.scrollY;')

    assert scroll_after > scroll_before, 'Page did not scroll down'

    # The action itself must also be recorded in the agent's history.
    performed = scroll_agent.history.action_names()
    assert 'scroll_down' in performed

View file

@ -0,0 +1,40 @@
"""
Test dropdown interaction functionality.
"""
import pytest
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
@pytest.mark.asyncio
async def test_dropdown(llm, browser_context):
    """Ask the agent to pick the 5th dropdown option, then verify the result text and the <select> value.

    Any exception, including a failed assertion, is re-reported through
    pytest.fail; the browser context is closed in all cases.
    """
    dropdown_agent = Agent(
        task=(
            'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option'
        ),
        llm=llm,
        browser_context=browser_context,
    )

    try:
        history: AgentHistoryList = await dropdown_agent.run(20)
        final_text = history.final_result()

        # The agent's final answer must mention the expected option.
        assert final_text is not None
        assert 'Duck' in final_text, "Expected 5th option 'Duck' to be selected"

        # The live DOM must agree with the agent's answer.
        select_element = await browser_context.get_element_by_selector('select')
        assert select_element is not None, 'Dropdown element should exist'
        selected_value = await select_element.evaluate('el => el.value')
        assert selected_value == '5', 'Dropdown should have 5th option selected'
    except Exception as exc:
        pytest.fail(f'Dropdown test failed: {str(exc)}')
    finally:
        await browser_context.close()

View file

@ -0,0 +1,44 @@
"""
Test complex dropdown interaction functionality.
"""
import pytest
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
@pytest.mark.asyncio
async def test_dropdown_complex(llm, browser_context):
    """Ask the agent to choose 'json' in a custom dropdown, then verify the answer and the page state.

    Any exception, including a failed assertion, is re-reported through
    pytest.fail; the browser context is closed in all cases.
    """
    dropdown_agent = Agent(
        task=(
            'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option'
        ),
        llm=llm,
        browser_context=browser_context,
    )

    try:
        history: AgentHistoryList = await dropdown_agent.run(20)
        final_text = history.final_result()

        # The agent's final answer must mention the chosen option.
        assert final_text is not None
        assert 'json' in final_text.lower(), "Expected 'json' option to be selected"

        # The custom dropdown widget must display the chosen option.
        selected_element = await browser_context.get_element_by_selector('.select-selected')
        assert selected_element is not None, 'Custom dropdown element should exist'
        displayed = await selected_element.text_content()
        assert 'json' in displayed.lower(), 'Dropdown should display json option'

        # Selecting JSON is expected to reveal a code element.
        code_element = await browser_context.get_element_by_selector('pre code')
        assert code_element is not None, 'Code element should be visible when JSON is selected'
    except Exception as exc:
        pytest.fail(f'Complex dropdown test failed: {str(exc)}')
    finally:
        await browser_context.close()

View file

@ -0,0 +1,40 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import os
import sys
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from langchain_openai import ChatOpenAI
from browser_use import Agent, AgentHistoryList
# Module-level setup: these statements run when the file is imported, so
# importing or collecting this module already constructs the chat model and
# the Agent below.
llm = ChatOpenAI(model='gpt-4o')
# browser = Browser(config=BrowserConfig(headless=False))
# The Agent gets its own BrowserContext on a non-headless Browser with
# disable_security=True.
agent = Agent(
    task=('go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"'),
    llm=llm,
    browser_context=BrowserContext(
        browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
    ),
)
async def test_dropdown():
    """Run the module-level agent for up to 20 steps and check that its final result mentions '4'."""
    history: AgentHistoryList = await agent.run(20)
    # await controller.browser.close(force=True)

    final_text = history.final_result()
    assert final_text is not None
    assert '4' in final_text
    print(final_text)
    # await browser.close()

View file

@ -0,0 +1,98 @@
import asyncio
import os
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
# run with:
# python -m pytest tests/test_excluded_actions.py -v -k "test_only_open_tab_allowed" --capture=no
@pytest.fixture(scope='session')
def event_loop():
    """Yield one event loop shared by the whole test session and close it on teardown."""
    session_loop = asyncio.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Yield a session-wide headless Browser and close it at the end of the session."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a fresh browser context for each test; it is exited on teardown."""
    opened = await browser.new_context()
    async with opened as active_context:
        yield active_context
@pytest.fixture
def llm():
    """Build the Azure OpenAI chat model used by these tests.

    Endpoint and key are read from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY,
    each falling back to an empty string when unset.
    """
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    secret_key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=endpoint,
        api_key=secret_key,
    )
# pytest tests/test_excluded_actions.py -v -k "test_only_open_tab_allowed" --capture=no
@pytest.mark.asyncio
async def test_only_open_tab_allowed(llm, context):
    """Check that an agent whose controller excludes the other listed actions only uses open_tab.

    A Controller is built with `exclude_actions`, the agent runs a task that
    would normally need searching, clicking and typing, and every recorded
    action name must be 'open_tab'.
    """
    # Listed default actions, everything except open_tab.
    excluded_actions = [
        'search_google',
        'go_to_url',
        'go_back',
        'click_element',
        # The controller tests in this suite assert on the name
        # 'click_element_by_index', so exclude that spelling too; otherwise the
        # click action stays available and this test does not test exclusion.
        'click_element_by_index',
        'input_text',
        'switch_tab',
        'extract_content',
        'done',
        'scroll_down',
        'scroll_up',
        'send_keys',
        'scroll_to_text',
        'get_dropdown_options',
        'select_dropdown_option',
    ]

    controller = Controller(exclude_actions=excluded_actions)

    # The task is one that would normally use several of the excluded actions.
    agent = Agent(
        task="Go to google.com and search for 'python programming'",
        llm=llm,
        browser_context=context,
        controller=controller,
    )
    history: AgentHistoryList = await agent.run(max_steps=2)
    action_names = history.action_names()

    # No action other than open_tab may appear.
    unexpected = [name for name in action_names if name != 'open_tab']
    assert not unexpected, f'Found unexpected actions: {unexpected}'

    # open_tab should be used at least once.
    assert 'open_tab' in action_names, 'open_tab action was not used'

View file

@ -0,0 +1,21 @@
import asyncio
from playwright.async_api import async_playwright
async def test_full_screen(start_fullscreen: bool, maximize: bool):
    """Launch a headed, maximized Chromium, open google.com for ten seconds, then close.

    Neither `start_fullscreen` nor `maximize` is read by the body; the launch
    options are fixed.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=False, args=['--start-maximized'])
        # The context is created with no_viewport=True and viewport=None.
        window_context = await chromium.new_context(no_viewport=True, viewport=None)
        page = await window_context.new_page()
        await page.goto('https://google.com')
        await asyncio.sleep(10)
        await chromium.close()
# Script entry point: run the full-screen check directly, outside pytest.
if __name__ == '__main__':
    asyncio.run(test_full_screen(False, False))

View file

@ -0,0 +1,40 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import os
import sys
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from langchain_openai import ChatOpenAI
from browser_use import Agent, AgentHistoryList
# Module-level setup: these statements run when the file is imported, so
# importing or collecting this module already constructs the chat model and
# the Agent below.
llm = ChatOpenAI(model='gpt-4o')
# The Agent is asked to write its GIF to ./google.gif, the path that
# test_gif_path checks for afterwards.
agent = Agent(
    task=('go to google.com and search for text "hi there"'),
    llm=llm,
    browser_context=BrowserContext(
        browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
    ),
    generate_gif='./google.gif',
)
async def test_gif_path():
    """Remove any stale ./google.gif, run the module-level agent, and check the GIF was written."""
    gif_already_there = os.path.exists('./google.gif')
    if gif_already_there:
        os.unlink('./google.gif')

    history: AgentHistoryList = await agent.run(20)
    final_text = history.final_result()

    assert final_text is not None
    assert os.path.exists('./google.gif'), 'google.gif was not created'

View file

@ -0,0 +1,137 @@
"""
Test browser automation using Mind2Web dataset tasks with pytest framework.
"""
import asyncio
import json
import os
from typing import Any
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.utils import logger
# Constants
# NOTE(review): MAX_STEPS is not referenced by the code visible in this module
# (agent.run() is called without arguments) - confirm whether it should be passed.
MAX_STEPS = 50
# Number of leading dataset entries kept by the `test_cases` fixture.
TEST_SUBSET_SIZE = 10
@pytest.fixture(scope='session')
def event_loop():
    """Yield one event loop shared by the whole test session and close it on teardown."""
    session_loop = asyncio.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Yield a session-wide headless Browser and close it at the end of the session."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a fresh browser context for each test; it is exited on teardown."""
    opened = await browser.new_context()
    async with opened as active_context:
        yield active_context
@pytest.fixture(scope='session')
def test_cases() -> list[dict[str, Any]]:
    """Load mind2web_data/processed.json (next to this file) and return its first TEST_SUBSET_SIZE entries."""
    here = os.path.dirname(__file__)
    dataset_path = os.path.join(here, 'mind2web_data/processed.json')
    logger.info(f'Loading test cases from {dataset_path}')

    with open(dataset_path) as handle:
        all_cases = json.load(handle)

    # Only a leading slice is used by the tests.
    selected = all_cases[:TEST_SUBSET_SIZE]
    logger.info(f'Loaded {len(selected)}/{len(all_cases)} test cases')
    return selected
@pytest.fixture
def llm():
    """Build the Azure OpenAI chat model used by these tests.

    Endpoint and key are read from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY,
    each falling back to an empty string when unset.
    """
    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    secret_key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=endpoint,
        api_key=secret_key,
    )
# run with: pytest -s -v tests/test_mind2web.py::test_random_samples
@pytest.mark.asyncio
async def test_random_samples(test_cases: list[dict[str, Any]], llm, context, validator):
    """Run the agent on one randomly chosen dataset task; the outcome is not validated yet.

    NOTE(review): `validator` is requested as a fixture but is neither used in
    the body nor defined in the code visible here - confirm it is provided
    elsewhere, otherwise pytest will error at setup.
    """
    import random

    logger.info('=== Testing Random Samples ===')

    # A single case is drawn at random from the loaded subset.
    chosen_cases = random.sample(test_cases, 1)
    total = len(chosen_cases)

    for position, case in enumerate(chosen_cases, 1):
        task = f'Go to {case["website"]}.com and {case["confirmed_task"]}'
        logger.info(f'--- Random Sample {position}/{total} ---')
        logger.info(f'Task: {task}\n')

        sample_agent = Agent(task, llm, browser_context=context)
        await sample_agent.run()

        logger.info('Validating random sample task...')
        # TODO: Validate the task
def test_dataset_integrity(test_cases):
    """Check that every loaded test case has the required fields with usable types.

    Each case must contain 'website', 'confirmed_task' and 'action_reprs';
    'confirmed_task' must be a str and 'action_reprs' a non-empty list.
    Type problems fail immediately; missing fields are collected and reported
    together after all cases have been scanned.

    Raises:
        AssertionError: on the first type violation, or at the end if any
            required field is missing. Raised explicitly rather than via
            `assert False`, so the check still runs under `python -O`.
    """
    logger.info('\n=== Testing Dataset Integrity ===')

    required_fields = ['website', 'confirmed_task', 'action_reprs']
    missing_fields = []
    total = len(test_cases)

    logger.info(f'Checking {total} test cases for required fields')

    for i, case in enumerate(test_cases, 1):
        logger.debug(f'Checking case {i}/{total}')

        for field in required_fields:
            if field not in case:
                missing_fields.append(f'Case {i}: {field}')
                logger.warning(f"Missing field '{field}' in case {i}")

        # Type checks
        if not isinstance(case.get('confirmed_task'), str):
            logger.error(f"Case {i}: 'confirmed_task' must be string")
            raise AssertionError('Task must be string')

        action_reprs = case.get('action_reprs')
        if not isinstance(action_reprs, list):
            logger.error(f"Case {i}: 'action_reprs' must be list")
            raise AssertionError('Actions must be list')

        # action_reprs is known to be a list here, so falsiness means empty.
        if not action_reprs:
            logger.error(f"Case {i}: 'action_reprs' must not be empty")
            raise AssertionError('Must have at least one action')

    if missing_fields:
        logger.error('Dataset integrity check failed')
        raise AssertionError(f'Missing fields: {missing_fields}')

    logger.info('✅ Dataset integrity check passed')
# Script entry point: run this file's tests verbosely through pytest.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View file

@ -0,0 +1,160 @@
import asyncio
import os
import httpx
import pytest
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
@pytest.fixture(scope='function')
def event_loop():
    """Yield a brand-new event loop for each test and close it on teardown."""
    fresh_loop = asyncio.new_event_loop()
    yield fresh_loop
    fresh_loop.close()
@pytest.fixture(scope='function')
async def browser(event_loop):
    """Yield a headless Browser for one test and close it afterwards."""
    headless_config = BrowserConfig(headless=True)
    instance = Browser(config=headless_config)
    yield instance
    await instance.close()
@pytest.fixture
async def context(browser):
    """Yield a browser context opened from the `browser` fixture; it is exited on teardown."""
    opened = await browser.new_context()
    async with opened as active_context:
        yield active_context
# Provider API keys, read once at import time. An unset (or empty) variable
# becomes an empty SecretStr, so the model objects below can still be built.
api_key_gemini = SecretStr(os.getenv('GOOGLE_API_KEY') or '')
api_key_deepseek = SecretStr(os.getenv('DEEPSEEK_API_KEY') or '')
api_key_anthropic = SecretStr(os.getenv('ANTHROPIC_API_KEY') or '')
# pytest -s -v tests/test_models.py
@pytest.fixture(
    params=[
        # Each entry pairs a chat model instance with its test id. The model
        # objects are constructed when this module is imported.
        pytest.param(ChatOpenAI(model='gpt-4o'), id='gpt-4o'),
        pytest.param(ChatOpenAI(model='gpt-4o-mini'), id='gpt-4o-mini'),
        pytest.param(
            AzureChatOpenAI(
                model='gpt-4o',
                api_version='2024-10-21',
                azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
                api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
            ),
            id='azure-gpt-4o',
        ),
        # pytest.param(
        #     ChatOpenAI(
        #         base_url='https://api.deepseek.com/v1',
        #         model='deepseek-reasoner',
        #         api_key=api_key_deepseek,
        #     ),
        #     id='deepseek-reasoner',
        # ),
        # run: ollama start
        pytest.param(
            ChatOllama(
                model='qwen2.5:latest',
                num_ctx=128000,
            ),
            id='qwen2.5:latest',
        ),
        pytest.param(
            AzureChatOpenAI(
                model='gpt-4o-mini',
                api_version='2024-10-21',
                azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
                api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
            ),
            id='azure-gpt-4o-mini',
        ),
        pytest.param(
            ChatAnthropic(
                model_name='claude-3-5-sonnet-20240620',
                timeout=100,
                temperature=0.0,
                stop=None,
                api_key=api_key_anthropic,
            ),
            id='claude-3-5-sonnet',
        ),
        pytest.param(
            ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=api_key_gemini),
            id='gemini-2.0-flash-exp',
        ),
        pytest.param(
            ChatGoogleGenerativeAI(model='gemini-1.5-pro', api_key=api_key_gemini),
            id='gemini-1.5-pro',
        ),
        pytest.param(
            ChatGoogleGenerativeAI(model='gemini-1.5-flash-latest', api_key=api_key_gemini),
            id='gemini-1.5-flash-latest',
        ),
        pytest.param(
            ChatOpenAI(
                base_url='https://api.deepseek.com/v1',
                model='deepseek-chat',
                api_key=api_key_deepseek,
            ),
            id='deepseek-chat',
        ),
    ],
)
async def llm(request):
    """Parametrized fixture: hand each configured chat model to the requesting test in turn."""
    return request.param
@pytest.mark.asyncio
async def test_model_search(llm, context):
    """Smoke-test one model: it passes if the agent performs a search or navigation action.

    Vision is disabled for the DeepSeek models, and a reachable local Ollama
    server is required for the qwen model.
    """
    model_name = llm.model if hasattr(llm, 'model') else llm.model_name
    print(f'\nTesting model: {model_name}')

    # Models in this list are run with use_vision=False.
    models_without_vision = ['deepseek-chat', 'deepseek-reasoner']
    lacks_vision = (hasattr(llm, 'model') and llm.model in models_without_vision) or (
        hasattr(llm, 'model_name') and llm.model_name in models_without_vision
    )
    use_vision = not lacks_vision

    # require ollama run
    local_models = ['qwen2.5:latest']
    if model_name in local_models:
        # Ping the local Ollama endpoint; any failure is reported with the same message.
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get('http://127.0.0.1:11434/')
                if response.status_code != 200:
                    raise Exception('Ollama is not running - start with `ollama start`')
        except Exception:
            raise Exception('Ollama is not running - start with `ollama start`')

    agent = Agent(
        task="Search Google for 'elon musk' then click on the first result and scroll down.",
        llm=llm,
        browser_context=context,
        max_failures=2,
        use_vision=use_vision,
    )
    history: AgentHistoryList = await agent.run(max_steps=2)

    done = history.is_done()
    successful = history.is_successful()
    action_names = history.action_names()
    print(f'Actions performed: {action_names}')

    error_lines = [e for e in history.errors() if e is not None]
    errors = '\n'.join(error_lines)

    # Any one of these actions counts as the model working.
    accepted_actions = ('search_google', 'go_to_url', 'open_tab')
    passed = any(name in action_names for name in accepted_actions)

    print(f'Model {model_name}: {"✅ PASSED - " if passed else "❌ FAILED - "} Done: {done} Successful: {successful}')
    assert passed, f'Model {model_name} not working\nActions performed: {action_names}\nErrors: {errors}'

View file

@ -0,0 +1,66 @@
import asyncio
import pytest
from langchain_ollama import ChatOllama
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
@pytest.fixture
def llm():
    """Build the local Ollama chat model (qwen2.5) used by this module's test."""
    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    # NOTE: Make sure to run ollama server with `ollama start`
    ollama_model = ChatOllama(model='qwen2.5:latest', num_ctx=128000)
    return ollama_model
@pytest.fixture(scope='session')
def event_loop():
    """Yield one event loop shared by the whole test session and close it on teardown."""
    session_loop = asyncio.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Yield a session-wide headless Browser and close it at the end of the session."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a fresh browser context for each test; it is exited on teardown."""
    opened = await browser.new_context()
    async with opened as active_context:
        yield active_context
# pytest tests/test_qwen.py -v -k "test_qwen_url" --capture=no
# @pytest.mark.asyncio
async def test_qwen_url(llm, context):
    """Check that the local Qwen model issues a navigation action for a plain go-to-URL task.

    The agent runs inside the fixture-provided `context`, so the browser
    context it uses is the one the fixture opens and exits.
    """
    agent = Agent(
        task='go_to_url amazon.com',
        llm=llm,
        browser_context=context,
    )
    history: AgentHistoryList = await agent.run(max_steps=3)

    # Each recorded model action is a dict whose first key is taken as the
    # action name; either navigation style counts.
    navigation_actions = ('go_to_url', 'open_tab')
    action_sequence = [
        'navigate'
        for action in history.model_actions()
        if next(iter(action), None) in navigation_actions
    ]

    assert 'navigate' in action_sequence  # Navigated to Amazon

View file

@ -0,0 +1,45 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import os
import sys
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import asyncio
from langchain_openai import ChatOpenAI
from browser_use import Agent, AgentHistoryList
# Module-level setup: these statements run when the file is imported, so
# importing or collecting this module already constructs the chat model and
# the Agent below.
llm = ChatOpenAI(model='gpt-4o')
# browser = Browser(config=BrowserConfig(headless=False))
# The Agent gets its own BrowserContext on a non-headless Browser with
# disable_security=True.
agent = Agent(
    task=(
        'go to https://codepen.io/shyam-king/pen/ByBJoOv and select "Tiger" dropdown and read the text given in "Selected Animal" box (it can be empty as well)'
    ),
    llm=llm,
    browser_context=BrowserContext(
        browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
    ),
)
async def test_dropdown():
    """Run the module-level agent for up to 10 steps and check that it produced a final result."""
    history: AgentHistoryList = await agent.run(10)
    # await controller.browser.close(force=True)

    final_text = history.final_result()
    assert final_text is not None
    print('result: ', final_text)
    # await browser.close()
# Script entry point: run the dropdown check directly, outside pytest.
if __name__ == '__main__':
    asyncio.run(test_dropdown())

View file

@ -0,0 +1,83 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import os
import shutil
import sys
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from langchain_openai import ChatOpenAI
from browser_use import Agent, AgentHistoryList
# Chat model shared by every test in this module; constructed at import time.
llm = ChatOpenAI(model='gpt-4o')
async def test_save_conversation_contains_slash():
    """With save_conversation_path='logs/conversation', expect ./logs/conversation_2.txt after the run."""
    # Start from a clean slate so leftovers from earlier runs cannot satisfy the checks.
    stale_logs = os.path.exists('./logs')
    if stale_logs:
        shutil.rmtree('./logs')

    headed_browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(browser=headed_browser),
        save_conversation_path='logs/conversation',
    )
    history: AgentHistoryList = await agent.run(20)
    final_text = history.final_result()

    assert final_text is not None
    assert os.path.exists('./logs'), 'logs directory was not created'
    assert os.path.exists('./logs/conversation_2.txt'), 'logs file was not created'
async def test_save_conversation_not_contains_slash():
    """With save_conversation_path='logs' (no slash), expect ./logs/_2.txt after the run."""
    # Start from a clean slate so leftovers from earlier runs cannot satisfy the checks.
    stale_logs = os.path.exists('./logs')
    if stale_logs:
        shutil.rmtree('./logs')

    headed_browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(browser=headed_browser),
        save_conversation_path='logs',
    )
    history: AgentHistoryList = await agent.run(20)
    final_text = history.final_result()

    assert final_text is not None
    assert os.path.exists('./logs'), 'logs directory was not created'
    assert os.path.exists('./logs/_2.txt'), 'logs file was not created'
async def test_save_conversation_deep_directory():
    """With a nested save_conversation_path, expect the directories and conversation_2.txt to be created."""
    # Start from a clean slate so leftovers from earlier runs cannot satisfy the checks.
    stale_logs = os.path.exists('./logs')
    if stale_logs:
        shutil.rmtree('./logs')

    headed_browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(browser=headed_browser),
        save_conversation_path='logs/deep/directory/conversation',
    )
    history: AgentHistoryList = await agent.run(20)
    final_text = history.final_result()

    assert final_text is not None
    assert os.path.exists('./logs/deep/directory'), 'logs directory was not created'
    assert os.path.exists('./logs/deep/directory/conversation_2.txt'), 'logs file was not created'

View file

@ -0,0 +1,198 @@
import asyncio
import os
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, SecretStr
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
@pytest.fixture(scope='session')
def event_loop():
    """Yield one event loop shared by the whole test session and close it on teardown."""
    session_loop = asyncio.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Yield a session-wide headless Browser and close it at the end of the session."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a fresh browser context for each test; it is exited on teardown."""
    opened = await browser.new_context()
    async with opened as active_context:
        yield active_context
@pytest.fixture
async def controller():
    """Yield a Controller with the custom actions these tests exercise already registered.

    Three actions take plain arguments, two declare a Pydantic `param_model`,
    and one takes two Pydantic models as separate arguments.
    """
    custom_controller = Controller()

    # --- Actions with plain (non-Pydantic) arguments ---

    @custom_controller.action('Print a message')
    def print_message(message: str):
        print(f'Message: {message}')
        return f'Printed message: {message}'

    @custom_controller.action('Add two numbers')
    def add_numbers(a: int, b: int):
        return f'The sum is {a + b}'

    @custom_controller.action('Concatenate strings')
    def concatenate_strings(str1: str, str2: str):
        return f'Concatenated string: {str1 + str2}'

    # --- Pydantic models used as action parameters ---

    class SimpleModel(BaseModel):
        name: str
        age: int

    class Address(BaseModel):
        street: str
        city: str

    class NestedModel(BaseModel):
        user: SimpleModel
        address: Address

    # --- Actions taking Pydantic models ---

    @custom_controller.action('Process simple model', param_model=SimpleModel)
    def process_simple_model(model: SimpleModel):
        return f'Processed {model.name}, age {model.age}'

    @custom_controller.action('Process nested model', param_model=NestedModel)
    def process_nested_model(model: NestedModel):
        person, place = model.user, model.address
        return f'Processed user {person.name}, age {person.age} at address {place.street}, {place.city}'

    @custom_controller.action('Process multiple models')
    def process_multiple_models(model1: SimpleModel, model2: Address):
        return f'Processed {model1.name} living at {model2.street}, {model2.city}'

    yield custom_controller
@pytest.fixture
def llm():
    """Build the Azure OpenAI chat model used by these tests.

    Endpoint and key are read from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY,
    each falling back to an empty string when unset.
    """
    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    secret_key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=endpoint,
        api_key=secret_key,
    )
# @pytest.mark.skip(reason="Skipping test for now")
@pytest.mark.asyncio
async def test_self_registered_actions_no_pydantic(llm, controller):
    """The agent should call all three plain-argument custom actions for a three-part task."""
    custom_agent = Agent(
        task="First, print the message 'Hello, World!'. Then, add 10 and 20. Next, concatenate 'foo' and 'bar'.",
        llm=llm,
        controller=controller,
    )
    history: AgentHistoryList = await custom_agent.run(max_steps=10)

    # Every custom action named in the task must show up in the history.
    performed = history.action_names()
    assert 'print_message' in performed
    assert 'add_numbers' in performed
    assert 'concatenate_strings' in performed
# @pytest.mark.skip(reason="Skipping test for now")
@pytest.mark.asyncio
async def test_mixed_arguments_actions(llm, controller):
    """Register an async action during the test and check the agent calls it and reports its result."""

    # An async action added on top of the ones the fixture registers.
    @controller.action('Calculate the area of a rectangle')
    async def calculate_area(length: float, width: float):
        return f'The area is {length * width}'

    area_agent = Agent(
        task='Calculate the area of a rectangle with length 5.5 and width 3.2.',
        llm=llm,
        controller=controller,
    )
    history = await area_agent.run(max_steps=5)

    # The action must have been executed ...
    performed = history.action_names()
    assert 'calculate_area' in performed

    # ... and its output must appear somewhere in the extracted content.
    correct = 'The area is 17.6'
    found = any(correct in content for content in history.extracted_content())
    if not found:
        pytest.fail(f'{correct} not found in extracted content')
@pytest.mark.asyncio
async def test_pydantic_simple_model(llm, controller):
    """The agent should call the simple-model action and surface its formatted result."""
    model_agent = Agent(
        task="Process a simple model with name 'Alice' and age 30.",
        llm=llm,
        controller=controller,
    )
    history = await model_agent.run(max_steps=5)

    # The action must have been executed ...
    performed = history.action_names()
    assert 'process_simple_model' in performed

    # ... and its output must appear somewhere in the extracted content.
    correct = 'Processed Alice, age 30'
    found = any(correct in content for content in history.extracted_content())
    if not found:
        pytest.fail(f'{correct} not found in extracted content')
@pytest.mark.asyncio
async def test_pydantic_nested_model(llm, controller):
    """The agent should call the nested-model action and surface its formatted result."""
    model_agent = Agent(
        task="Process a nested model with user name 'Bob', age 25, living at '123 Maple St', 'Springfield'.",
        llm=llm,
        controller=controller,
    )
    history = await model_agent.run(max_steps=5)

    # The action must have been executed ...
    performed = history.action_names()
    assert 'process_nested_model' in performed

    # ... and its output must appear somewhere in the extracted content.
    correct = 'Processed user Bob, age 25 at address 123 Maple St, Springfield'
    found = any(correct in content for content in history.extracted_content())
    if not found:
        pytest.fail(f'{correct} not found in extracted content')
# run this file with:
# pytest tests/test_self_registered_actions.py --capture=no

View file

@ -0,0 +1,91 @@
import pytest
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel, Field
from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
from browser_use.agent.views import MessageManagerState
from browser_use.controller.registry.service import Registry
class SensitiveParams(BaseModel):
    """Single-field parameter model used by the sensitive-data tests."""

    # Free text that may contain <secret>key</secret> placeholders.
    text: str = Field(description='Text with sensitive data placeholders')
@pytest.fixture
def registry():
    """Provide a default-constructed Registry for each test."""
    fresh_registry = Registry()
    return fresh_registry
@pytest.fixture
def message_manager():
    """Provide a MessageManager with default settings and state and a fixed task and system message."""
    system_prompt = SystemMessage(content='System message')
    manager = MessageManager(
        task='Test task',
        system_message=system_prompt,
        settings=MessageManagerSettings(),
        state=MessageManagerState(),
    )
    return manager
def test_replace_sensitive_data_with_missing_keys(registry):
    """_replace_sensitive_data should fill known placeholders and leave unknown or empty ones tagged."""
    # Text carrying two <secret> placeholders.
    params = SensitiveParams(text='Please enter <secret>username</secret> and <secret>password</secret>')

    # Case 1: both keys supplied -> both values appear in the text.
    all_present = registry._replace_sensitive_data(params, {'username': 'user123', 'password': 'pass456'})
    assert 'user123' in all_present.text
    assert 'pass456' in all_present.text

    # Case 2: password missing -> username replaced, password tag kept.
    one_missing = registry._replace_sensitive_data(params, {'username': 'user123'})
    assert 'user123' in one_missing.text
    assert '<secret>password</secret>' in one_missing.text

    # Case 3: nothing supplied -> both tags kept.
    none_present = registry._replace_sensitive_data(params, {})
    assert '<secret>username</secret>' in none_present.text
    assert '<secret>password</secret>' in none_present.text

    # Case 4: empty password value -> treated the same as a missing key.
    one_empty = registry._replace_sensitive_data(params, {'username': 'user123', 'password': ''})
    assert 'user123' in one_empty.text
    assert '<secret>password</secret>' in one_empty.text
def test_filter_sensitive_data(message_manager):
    """Test that _filter_sensitive_data handles all sensitive data scenarios correctly.

    Every case filters its own freshly built message, so an earlier case cannot
    influence a later one should the filter modify the message it is given
    (NOTE(review): whether it mutates in place is not visible from this test).
    Case 4 additionally asserts what its description always claimed: a secret with
    an empty value must not be redacted.
    """
    raw_text = 'My username is admin and password is secret123'

    # Case 1: No sensitive data provided -> content is returned unchanged
    message_manager.settings.sensitive_data = None
    result = message_manager._filter_sensitive_data(HumanMessage(content=raw_text))
    assert result.content == raw_text

    # Case 2: All sensitive data is properly replaced
    message_manager.settings.sensitive_data = {'username': 'admin', 'password': 'secret123'}
    result = message_manager._filter_sensitive_data(HumanMessage(content=raw_text))
    assert '<secret>username</secret>' in result.content
    assert '<secret>password</secret>' in result.content

    # Case 3: Make sure it works with nested (list-of-parts) content
    nested_message = HumanMessage(content=[{'type': 'text', 'text': raw_text}])
    result = message_manager._filter_sensitive_data(nested_message)
    assert '<secret>username</secret>' in result.content[0]['text']
    assert '<secret>password</secret>' in result.content[0]['text']

    # Case 4: Empty values -> only username is replaced, the password text stays as-is
    message_manager.settings.sensitive_data = {'username': 'admin', 'password': ''}
    result = message_manager._filter_sensitive_data(HumanMessage(content=raw_text))
    assert '<secret>username</secret>' in result.content
    assert '<secret>password</secret>' not in result.content
    assert 'secret123' in result.content

View file

@ -0,0 +1,344 @@
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
from pydantic import BaseModel
from browser_use.agent.service import Agent
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserState
from browser_use.controller.registry.service import Registry
from browser_use.controller.registry.views import ActionModel
from browser_use.controller.service import Controller
# Run these tests with:
#   python -m pytest tests/test_service.py
class TestAgent:
    """Unit tests for ``Agent`` that run against mocked collaborators only."""

    @pytest.fixture
    def mock_controller(self):
        """Controller mock whose registry exposes a single ``test_action`` entry."""
        fake_registry = Mock(spec=Registry)
        fake_registry.registry = MagicMock()
        fake_registry.registry.actions = {'test_action': MagicMock(param_model=MagicMock())}  # type: ignore

        fake_controller = Mock(spec=Controller)
        fake_controller.registry = fake_registry
        return fake_controller

    @pytest.fixture
    def mock_llm(self):
        """Mock constrained to the ``BaseChatModel`` interface."""
        return Mock(spec=BaseChatModel)

    @pytest.fixture
    def mock_browser(self):
        """Mock constrained to the ``Browser`` interface."""
        return Mock(spec=Browser)

    @pytest.fixture
    def mock_browser_context(self):
        """Mock constrained to the ``BrowserContext`` interface."""
        return Mock(spec=BrowserContext)

    def test_convert_initial_actions(self, mock_controller, mock_llm, mock_browser, mock_browser_context):  # type: ignore
        """
        ``_convert_initial_actions`` turns dictionary-based actions into ActionModel instances.

        Verified here:
        1. One output item is produced for the single input action.
        2. The registered param_model is called once with the action's parameters.
        3. ActionModel is instantiated once, receiving the validated params under the action name.
        4. The returned item is the instance produced by ActionModel.
        """
        # Arrange
        agent = Agent(
            task='Test task',
            llm=mock_llm,
            controller=mock_controller,
            browser=mock_browser,
            browser_context=mock_browser_context,
        )
        actions_as_dicts = [{'test_action': {'param1': 'value1', 'param2': 'value2'}}]

        # Swap the agent's ActionModel for a mock so instantiation can be observed.
        built_instance = MagicMock()
        action_model_cls = MagicMock(spec=ActionModel, return_value=built_instance)
        agent.ActionModel = action_model_cls  # type: ignore

        # Act
        converted = agent._convert_initial_actions(actions_as_dicts)

        # Assert
        registered_param_model = mock_controller.registry.registry.actions['test_action'].param_model  # type: ignore
        assert len(converted) == 1
        registered_param_model.assert_called_once_with(param1='value1', param2='value2')
        action_model_cls.assert_called_once()
        assert isinstance(converted[0], MagicMock)
        assert converted[0] == built_instance

        # The keyword arguments given to ActionModel must carry the validated params.
        ctor_kwargs = action_model_cls.call_args[1]
        assert 'test_action' in ctor_kwargs
        assert ctor_kwargs['test_action'] == registered_param_model.return_value

    @pytest.mark.asyncio
    async def test_step_error_handling(self):
        """
        ``Agent.step`` must record a failure when ``get_next_action`` raises.

        ``get_next_action`` is replaced by a mock raising ``ValueError``; afterwards the
        failure counter and the last result are inspected.
        """
        llm_stub = MagicMock(spec=BaseChatModel)

        # MessageManager is patched where the agent service module looks it up.
        with patch('browser_use.agent.service.MessageManager'):
            agent = Agent(task='Test task', llm=llm_stub)

            # Force the failure path.
            agent.get_next_action = AsyncMock(side_effect=ValueError('Test error'))

            # Browser context returning a fixed state.
            fixed_state = BrowserState(
                url='https://example.com',
                title='Example',
                element_tree=MagicMock(),  # placeholder element tree
                tabs=[],
                selector_map={},
                screenshot='',
            )
            agent.browser_context = AsyncMock()
            agent.browser_context.get_state = AsyncMock(return_value=fixed_state)

            agent.controller = AsyncMock()

            await agent.step()

            # The error must have been captured rather than propagated.
            assert agent.consecutive_failures == 1
            assert len(agent._last_result) == 1
            recorded = agent._last_result[0]
            assert isinstance(recorded, ActionResult)
            assert 'Test error' in recorded.error
            assert recorded.include_in_memory is True
class TestRegistry:
    """Tests for action registration and execution on ``Registry``."""

    @pytest.fixture
    def registry_with_excludes(self):
        """Registry configured to exclude the action named ``excluded_action``."""
        return Registry(exclude_actions=['excluded_action'])

    def test_action_decorator_with_excluded_action(self, registry_with_excludes):
        """
        The ``action`` decorator must skip functions listed in ``exclude_actions``
        while still registering every other function.
        """

        # The function name is what the registry keys on, so it must match the exclude list.
        def excluded_action():
            pass

        exclude_decorator = registry_with_excludes.action(description='This should be excluded')
        returned_func = exclude_decorator(excluded_action)

        # The decorator hands back the original function and registers nothing.
        assert returned_func == excluded_action
        assert 'excluded_action' not in registry_with_excludes.registry.actions

        def included_action():
            pass

        include_decorator = registry_with_excludes.action(description='This should be included')
        include_decorator(included_action)

        # A function that is not excluded ends up in the registry.
        assert 'included_action' in registry_with_excludes.registry.actions

    @pytest.mark.asyncio
    async def test_execute_action_with_and_without_browser_context(self):
        """
        ``execute_action`` must handle actions with and without a browser argument.

        Verified here:
        1. An action taking ``browser`` runs and returns its result.
        2. An action not taking ``browser`` runs and returns its result.
        3. Calling the browser action without a browser raises ``RuntimeError``.
        4. Each underlying function was called exactly once with the expected arguments.
        """
        registry = Registry()

        class TestActionModel(BaseModel):
            param1: str

        # Real coroutine functions used as side effects of the registered mocks.
        async def run_with_browser(param1: str, browser):
            return f'Action executed with {param1} and browser'

        async def run_without_browser(param1: str):
            return f'Action executed with {param1}'

        actions = registry.registry.actions
        actions['test_action_with_browser'] = MagicMock(
            function=AsyncMock(side_effect=run_with_browser),
            param_model=TestActionModel,
            description='Test action with browser',
        )
        actions['test_action_without_browser'] = MagicMock(
            function=AsyncMock(side_effect=run_without_browser),
            param_model=TestActionModel,
            description='Test action without browser',
        )

        fake_browser = MagicMock()

        # With a browser supplied.
        outcome_with_browser = await registry.execute_action(
            'test_action_with_browser', {'param1': 'test_value'}, browser=fake_browser
        )
        assert outcome_with_browser == 'Action executed with test_value and browser'

        # Without any browser.
        outcome_without_browser = await registry.execute_action('test_action_without_browser', {'param1': 'test_value'})
        assert outcome_without_browser == 'Action executed with test_value'

        # Browser required but not supplied.
        with pytest.raises(RuntimeError, match='Action test_action_with_browser requires browser but none provided'):
            await registry.execute_action('test_action_with_browser', {'param1': 'test_value'})

        # The failing call above must not have reached the function, hence "once".
        actions['test_action_with_browser'].function.assert_called_once_with(param1='test_value', browser=fake_browser)
        actions['test_action_without_browser'].function.assert_called_once_with(param1='test_value')
class TestAgentRetry:
    """Tests for how ``Agent.step`` reacts when the model returns an empty action list."""

    @pytest.fixture
    def mock_llm(self):
        """Async mock standing in for the language model."""
        return AsyncMock()

    @pytest.fixture
    def mock_controller(self):
        """Controller mock whose registry holds no actions."""
        controller = Mock()
        controller.registry = Mock()
        controller.registry.registry = Mock()
        controller.registry.registry.actions = {}
        return controller

    @pytest.fixture
    def mock_browser_context(self):
        """Browser-context mock whose ``get_state`` returns a fixed ``BrowserState``."""
        browser_context = Mock()
        browser_context.get_state = AsyncMock(
            return_value=BrowserState(
                url='https://parabank.parasoft.com/parabank/index.htm',
                title='ParaBank',
                element_tree=MagicMock(),
                tabs=[],
                selector_map={},
                screenshot='',
            )
        )
        return browser_context

    @pytest.fixture
    def mock_action_model(self):
        """Mock constrained to the ``ActionModel`` interface."""
        action_model = Mock(spec=ActionModel)
        return action_model

    @pytest.mark.asyncio
    async def test_step_empty_action_retry(self, mock_llm, mock_controller, mock_browser_context, mock_action_model):
        """
        Test that the step method retries and handles empty actions correctly.
        """
        # Arrange
        agent = Agent(
            task='Test task',
            llm=mock_llm,
            controller=mock_controller,
            browser=Mock(),
            browser_context=mock_browser_context,
        )
        agent.ActionModel = mock_action_model  # Inject the mock ActionModel

        # Mock get_next_action to return empty action the first time, then a valid action
        empty_model_output = MagicMock()
        empty_model_output.action = []  # Empty action
        valid_model_output = MagicMock()
        valid_action = MagicMock()
        valid_model_output.action = [valid_action]

        mock_llm.return_value.invoke.side_effect = [empty_model_output, valid_model_output]
        agent.get_next_action = mock_llm.return_value.invoke

        # Act
        await agent.step()

        # Assert
        # Check that get_next_action was called twice (initial call + retry)
        assert agent.get_next_action.call_count == 2
        # Check that the LLM was called twice (same mock object as get_next_action)
        assert mock_llm.return_value.invoke.call_count == 2

        # Check that the second call to get_next_action included the clarification message.
        # A recorded call unpacks as (positional args, keyword args); the message list is
        # expected as the first positional argument, so it must be read from the args
        # tuple and not from the kwargs dict.
        retry_args, _ = mock_llm.return_value.invoke.call_args_list[1]
        retry_messages = retry_args[0]
        assert len(retry_messages) == 2  # input_messages + clarification message
        assert isinstance(retry_messages[1], HumanMessage)
        assert 'You forgot to return an action' in retry_messages[1].content

        # Check that _last_result contains the valid action
        assert len(agent._last_result) == 1
        assert agent._last_result[0].action == valid_action

    @pytest.mark.asyncio
    async def test_step_empty_action_retry_and_fail(self, mock_llm, mock_controller, mock_browser_context, mock_action_model):
        """
        Test that the step method handles the case where get_next_action returns
        empty actions twice, and inserts a safe noop action.
        """
        # Arrange
        agent = Agent(
            task='Test task',
            llm=mock_llm,
            controller=mock_controller,
            browser=Mock(),
            browser_context=mock_browser_context,
        )
        agent.ActionModel = mock_action_model  # Inject the mock ActionModel

        # Mock get_next_action to return empty action both times
        empty_model_output = MagicMock()
        empty_model_output.action = []  # Empty action

        mock_llm.return_value.invoke.return_value = empty_model_output
        agent.get_next_action = mock_llm.return_value.invoke

        # Mock the ActionModel instance creation
        mock_action_instance = MagicMock()
        mock_action_model.return_value = mock_action_instance

        # Act
        await agent.step()

        # Assert
        # Check that get_next_action was called twice
        assert agent.get_next_action.call_count == 2
        # Check that the LLM was called twice (same mock object as get_next_action)
        assert mock_llm.return_value.invoke.call_count == 2

        # Check that ActionModel was instantiated with the noop action
        mock_action_model.assert_called_once()
        call_args = mock_action_model.call_args[1]
        assert 'done' in call_args
        assert call_args['done'] == {'success': False, 'text': 'No action returned, safe exit.'}

        # Check that _last_result contains the noop action
        assert len(agent._last_result) == 1
        assert agent._last_result[0].action == mock_action_instance

View file

@ -0,0 +1,115 @@
import asyncio
import os
import random
import string
import time
import pytest
from langchain_openai import AzureChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
@pytest.fixture(scope='session')
def event_loop():
    """Yield one event loop for the whole test session and close it at teardown."""
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Yield a headless ``Browser`` shared by the session; it is closed at teardown."""
    headless_config = BrowserConfig(headless=True)
    shared_browser = Browser(config=headless_config)
    yield shared_browser
    await shared_browser.close()
@pytest.fixture
async def context(browser):
    """Yield a context obtained from ``browser.new_context()``, exited via its async context manager."""
    pending_context = await browser.new_context()
    async with pending_context as active_context:
        yield active_context
@pytest.fixture
def llm():
    """Build the Azure chat model; endpoint and key come from the environment (empty string if unset)."""
    endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', '')
    secret_key = SecretStr(os.getenv('AZURE_OPENAI_KEY', ''))
    return AzureChatOpenAI(
        api_version='2024-10-21',
        model='gpt-4o',
        azure_endpoint=endpoint,
        api_key=secret_key,
    )
def generate_random_text(length: int) -> str:
    """Return ``length`` characters picked at random from ASCII letters, digits and the space."""
    alphabet = string.ascii_letters + string.digits + ' '
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
@pytest.fixture
async def controller():
    """Yield a ``Controller`` with one extra action that returns a fixed 10000-character random text."""
    test_controller = Controller()
    # Generated once, so every call of the action returns the same text.
    special_text = generate_random_text(10000)

    @test_controller.action('call this magical function to get very special text')
    def get_very_special_text():
        return special_text

    yield test_controller
@pytest.mark.asyncio
async def test_token_limit_with_multiple_extractions(llm, controller, context):
    """Run five large-text extractions under a 2000-token input limit and inspect the history."""
    agent = Agent(
        task='Call the magical function to get very special text 5 times',
        llm=llm,
        controller=controller,
        browser_context=context,
        max_input_tokens=2000,
        save_conversation_path='tmp/stress_test/test_token_limit_with_multiple_extractions.json',
    )

    history = await agent.run(max_steps=5)

    # get_very_special_text must have been invoked exactly five times
    special_text_calls = sum(1 for name in history.action_names() if name == 'get_very_special_text')
    assert special_text_calls == 5

    # The message history must hold more than 3 messages.
    # NOTE(review): the previous comment here said "should be max 3 messages", which
    # contradicts this assertion - confirm which bound is intended.
    assert len(agent.message_manager.history.messages) > 3
@pytest.mark.slow
@pytest.mark.parametrize('max_tokens', [4000])  # 8000 20000
@pytest.mark.asyncio
async def test_open_3_tabs_and_extract_content(llm, controller, context, max_tokens):
    """Stress test: have the agent open three tabs on one URL and extract content from each."""
    agent = Agent(
        task='Open 3 tabs with https://en.wikipedia.org/wiki/Internet and extract the content from each.',
        llm=llm,
        controller=controller,
        browser_context=context,
        max_input_tokens=max_tokens,
        save_conversation_path='tmp/stress_test/test_open_3_tabs_and_extract_content.json',
    )

    # Time the run; the duration is only printed, never asserted on.
    started_at = time.time()
    history = await agent.run(max_steps=7)
    elapsed = time.time() - started_at
    print(f'Total time: {elapsed:.2f} seconds')

    # The run must finish without recorded errors
    assert len(history.errors()) == 0, 'Errors occurred during the test'
    # and leave at least three tabs in the context state
    assert len(context.current_state.tabs) >= 3, '3 tabs were not opened'

View file

@ -0,0 +1,575 @@
import asyncio
import logging
import pytest
from dotenv import load_dotenv
from pytest_httpserver import HTTPServer
load_dotenv()
from browser_use.agent.views import ActionModel
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller
from browser_use.controller.views import (
CloseTabAction,
GoToUrlAction,
OpenTabAction,
SwitchTabAction,
)
# Dedicated logger for the tab-management tests. Its level is set to DEBUG so the
# logger.debug() calls made by the tests in this module are not filtered out by
# this logger itself.
logger = logging.getLogger('tab_tests')
logger.setLevel(logging.DEBUG)
class TestTabManagement:
"""Tests for the tab management system with separate agent_current_page and human_current_page references."""
@pytest.fixture(scope='module')
def event_loop(self):
    """Yield a module-scoped event loop for the async tests and close it at teardown."""
    policy = asyncio.get_event_loop_policy()
    module_loop = policy.new_event_loop()
    yield module_loop
    module_loop.close()
@pytest.fixture(scope='module')
def http_server(self):
    """Start a local HTTP server serving four static pages (/page1 .. /page4); stop it at teardown."""
    server = HTTPServer()
    server.start()

    # The four pages differ only in their number, so register them in a loop.
    for page_number in range(1, 5):
        heading = f'Test Page {page_number}'
        server.expect_request(f'/page{page_number}').respond_with_data(
            f'<html><head><title>{heading}</title></head><body><h1>{heading}</h1></body></html>',
            content_type='text/html',
        )

    yield server
    server.stop()
@pytest.fixture(scope='module')
async def browser(self, event_loop):
    """Yield a headless ``Browser`` shared by the module; it is closed at teardown."""
    headless_config = BrowserConfig(headless=True)
    module_browser = Browser(config=headless_config)
    yield module_browser
    await module_browser.close()
@pytest.fixture
async def browser_context(self, browser, http_server):
    """Yield a ``BrowserContext`` holding one tab on the test server's /page1; close it at teardown."""
    context = BrowserContext(browser=browser)

    # Start the session and close whatever pages it already has.
    session = await context.get_session()
    for stale_page in session.context.pages:
        await stale_page.close()

    # Open the initial tab and give it time to finish loading.
    server_address = f'{http_server.host}:{http_server.port}'
    await context.create_new_tab(f'http://{server_address}/page1')
    await asyncio.sleep(1)

    # Both page references must be set and the agent page must be on the test server.
    assert context.agent_current_page is not None
    assert context.human_current_page is not None
    assert server_address in context.agent_current_page.url

    yield context
    await context.close()
@pytest.fixture
def controller(self):
    """Provide a default-constructed ``Controller``."""
    default_controller = Controller()
    return default_controller
@pytest.fixture
def base_url(self, http_server):
    """Provide the ``http://host:port`` prefix of the test HTTP server."""
    host, port = http_server.host, http_server.port
    return f'http://{host}:{port}'
# Helper methods
async def _execute_action(self, controller, browser_context, action_data):
    """Run a single action through ``controller.act`` and return its result.

    ``action_data`` maps one action name to its parameter object; only the first
    key is considered.
    """
    action_name = list(action_data)[0]
    action_params = action_data[action_name]

    # Throwaway ActionModel subclass intended to carry just this one action.
    class DynamicActionModel(ActionModel):
        pass

    # NOTE(review): this assigns a plain class attribute after the class was created;
    # whether the model then treats it as a declared field is not visible here - confirm.
    setattr(DynamicActionModel, action_name, type(action_params) | None)

    action = DynamicActionModel(**action_data)
    outcome = await controller.act(action, browser_context)

    # Brief pause so the browser can process the action before the caller continues.
    await asyncio.sleep(0.5)
    return outcome
async def _ensure_synchronized_state(self, browser_context, base_url):
    """Make sure both page references point at a page that belongs to the session."""
    session = await browser_context.get_session()

    # Agent reference: repair it when it is unset or not among the session's pages.
    if not (browser_context.agent_current_page and browser_context.agent_current_page in session.context.pages):
        if not session.context.pages:
            # No page at all: open one on the test server and wait for it to initialize.
            await browser_context.create_new_tab(f'{base_url}/page1')
            await asyncio.sleep(1)
        else:
            browser_context.agent_current_page = session.context.pages[0]

    # Human reference: fall back to whatever the agent reference is now.
    if not (browser_context.human_current_page and browser_context.human_current_page in session.context.pages):
        browser_context.human_current_page = browser_context.agent_current_page
async def _simulate_user_tab_change(self, page, browser_context):
    """Imitate a human bringing ``page`` to the foreground and log both references before and after."""

    def url_or_none(tab):
        # Rendering used in the log lines: the tab's URL, or the text "None" when unset.
        return tab.url if tab else 'None'

    logger.debug(
        f'BEFORE: agent_tab={url_or_none(browser_context.agent_current_page)}, '
        f'human_current_page={url_or_none(browser_context.human_current_page)}'
    )
    logger.debug(f'Simulating user changing to -> {page.url}')

    # The physical action a user would take.
    await page.bring_to_front()

    # Fire a focus event on the body from outside the page.
    await page.dispatch_event('body', 'focus')

    # Earlier attempts that dispatched synthetic focus / pointermove / deviceorientation /
    # visibilitychange events through page.evaluate() are disabled.
    # Cheat for now, because playwright really messes with foreground tab detection:
    # look up the first window property whose name starts with 'onVisibilityChange'
    # and call it directly.
    # TODO: fix this properly by triggering the right events and detecting them in playwright
    invoke_visibility_listener = """() => {
const listener = Object.keys(window).filter(k => k.startsWith('onVisibilityChange'))[0]
if (listener) {
window[listener]({ bubbles: true, cancelable: false })
}
}"""
    await page.evaluate(invoke_visibility_listener)

    # Let the event handlers run.
    await asyncio.sleep(0.5)

    logger.debug(
        f'AFTER: agent_tab URL={url_or_none(browser_context.agent_current_page)}, '
        f'human_current_page URL={url_or_none(browser_context.human_current_page)}'
    )
# Tab management tests
@pytest.mark.asyncio
async def test_open_tab_updates_both_references(self, browser_context, base_url):
    """Opening a tab with ``create_new_tab`` must move both page references to the new tab."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Snapshot of the state before the new tab exists.
    session = await browser_context.get_session()
    pages_before = len(session.context.pages)
    agent_page_before = browser_context.agent_current_page

    second_url = f'{base_url}/page2'
    await browser_context.create_new_tab(second_url)
    await asyncio.sleep(1)  # let events settle

    # Exactly one page was added.
    session = await browser_context.get_session()
    assert len(session.context.pages) == pages_before + 1

    # Both references are set, agree with each other and moved away from the old page.
    assert browser_context.human_current_page is not None
    assert browser_context.agent_current_page is not None
    assert browser_context.human_current_page == browser_context.agent_current_page
    assert agent_page_before != browser_context.agent_current_page
    assert second_url in browser_context.agent_current_page.url
@pytest.mark.asyncio
async def test_switch_tab_updates_both_references(self, browser_context, base_url):
    """``switch_to_tab`` must move both page references to the chosen tab."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Add a second tab; it becomes the active one.
    second_url = f'{base_url}/page2'
    await browser_context.create_new_tab(second_url)
    await asyncio.sleep(1)
    assert second_url in browser_context.agent_current_page.url

    # Go back to the tab at index 0.
    session = await browser_context.get_session()
    tab_at_index_zero = session.context.pages[0]
    await browser_context.switch_to_tab(0)
    await asyncio.sleep(0.5)

    # Both references now point at that first tab.
    assert browser_context.human_current_page is not None
    assert browser_context.agent_current_page is not None
    assert browser_context.human_current_page == browser_context.agent_current_page
    assert browser_context.agent_current_page == tab_at_index_zero
    assert f'{base_url}/page1' in browser_context.agent_current_page.url

    # The page object is usable: its title can be read.
    current_page = await browser_context.get_agent_current_page()
    assert 'Test Page 1' in await current_page.title()
@pytest.mark.asyncio
async def test_close_tab_handles_references_correctly(self, browser_context, base_url):
    """Closing the current tab must leave both references on the remaining open tab."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Remember the first tab, then open and verify a second one.
    surviving_tab = browser_context.agent_current_page
    second_url = f'{base_url}/page2'
    await browser_context.create_new_tab(second_url)
    await asyncio.sleep(1)
    assert second_url in browser_context.agent_current_page.url

    await browser_context.close_current_tab()
    await asyncio.sleep(0.5)

    # Both references fall back to the tab that is still open.
    assert browser_context.human_current_page is not None
    assert browser_context.agent_current_page is not None
    assert browser_context.human_current_page == browser_context.agent_current_page
    assert browser_context.agent_current_page == surviving_tab
    assert not browser_context.human_current_page.is_closed()
    assert f'{base_url}/page1' in browser_context.human_current_page.url
@pytest.mark.asyncio
async def test_user_changes_tab(self, browser_context, base_url):
    """A simulated user tab change must move ``human_current_page`` but not ``agent_current_page``."""
    first_url = f'{base_url}/page1'
    second_url = f'{base_url}/page2'

    await self._ensure_synchronized_state(browser_context, base_url)

    # Second tab, which becomes active.
    await browser_context.create_new_tab(second_url)
    await asyncio.sleep(1)
    assert second_url in browser_context.agent_current_page.url

    # Put agent and (simulated) user back on the first tab.
    session = await browser_context.get_session()
    first_tab = session.context.pages[0]
    await browser_context.switch_to_tab(0)
    await self._simulate_user_tab_change(first_tab, browser_context)
    await asyncio.sleep(0.5)

    agent_tab = browser_context.agent_current_page
    assert first_url in agent_tab.url

    # The user is about to move to the second tab.
    session = await browser_context.get_session()
    user_tab = session.context.pages[1]

    # The simulation relies on a visibility listener being exposed on the page, so check first.
    listeners = await user_tab.evaluate("() => Object.keys(window).filter(k => k.startsWith('onVisibilityChange'))")
    logger.debug(f'Tab visibility listeners: {listeners}')
    assert len(listeners) > 0, 'No visibility listeners found on the page'

    await self._simulate_user_tab_change(user_tab, browser_context)

    # Agent stays on page1, human moved to page2.
    assert browser_context.agent_current_page == agent_tab
    assert browser_context.human_current_page != browser_context.agent_current_page
    assert first_url in browser_context.agent_current_page.url
    assert second_url in browser_context.human_current_page.url
@pytest.mark.asyncio
async def test_get_agent_current_page(self, browser_context, base_url):
    """``get_agent_current_page`` must return the agent's page even after the user moved elsewhere."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Second tab with a different URL.
    await browser_context.create_new_tab(f'{base_url}/page2')
    await asyncio.sleep(1)

    # Agent goes back to the first tab ...
    await browser_context.switch_to_tab(0)
    await asyncio.sleep(0.5)

    # ... while the simulated user moves to the second one.
    session = await browser_context.get_session()
    tab_for_user = session.context.pages[1]
    await self._simulate_user_tab_change(tab_for_user, browser_context)

    # The agent's page is returned, not the one in the foreground.
    returned_page = await browser_context.get_agent_current_page()
    assert returned_page == browser_context.agent_current_page
    assert f'{base_url}/page1' in returned_page.url

    # The returned page is usable: its title can be read.
    assert 'Test Page 1' in await returned_page.title()
@pytest.mark.asyncio
async def test_browser_operations_use_agent_current_page(self, browser_context, base_url):
    """Navigating the agent's page must not affect the page the human is looking at."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Second tab with a different URL.
    await browser_context.create_new_tab(f'{base_url}/page2')
    await asyncio.sleep(1)

    # Agent back on the first tab.
    await browser_context.switch_to_tab(0)
    await asyncio.sleep(0.5)

    # Simulated user on the second tab.
    session = await browser_context.get_session()
    tab_for_user = session.context.pages[1]
    await self._simulate_user_tab_change(tab_for_user, browser_context)

    # Precondition: the two references diverge as intended.
    assert browser_context.human_current_page != browser_context.agent_current_page
    assert f'{base_url}/page2' in browser_context.human_current_page.url
    assert f'{base_url}/page1' in browser_context.agent_current_page.url

    # Navigate on the agent's page only.
    agent_page = await browser_context.get_agent_current_page()
    third_url = f'{base_url}/page3'
    await agent_page.goto(third_url)
    await asyncio.sleep(0.5)

    # The agent page moved; the human page did not.
    assert third_url in browser_context.agent_current_page.url
    assert f'{base_url}/page2' in browser_context.human_current_page.url
@pytest.mark.asyncio
async def test_tab_reference_recovery(self, browser_context, base_url):
    """``get_agent_current_page`` must recover after ``agent_current_page`` was cleared."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Have more than one tab available.
    await browser_context.create_new_tab(f'{base_url}/page2')
    await asyncio.sleep(1)

    # Break the agent reference on purpose.
    browser_context.agent_current_page = None

    recovered_page = await browser_context.get_agent_current_page()

    # A live page came back ...
    assert recovered_page is not None
    assert not recovered_page.is_closed()

    # ... that can be queried ...
    page_title = await recovered_page.title()
    assert page_title, 'Page should have a title'

    # ... and both references are set again.
    assert browser_context.agent_current_page is not None
    assert browser_context.human_current_page is not None
@pytest.mark.asyncio
async def test_reconcile_tab_state_handles_both_invalid(self, browser_context, base_url):
    """``_reconcile_tab_state`` must restore both references after both were cleared."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Clear both references at once.
    browser_context.agent_current_page = browser_context.human_current_page = None

    await browser_context._reconcile_tab_state()

    # Both are back, identical, and point at an open page.
    restored_agent_page = browser_context.agent_current_page
    assert restored_agent_page is not None
    assert browser_context.human_current_page is not None
    assert browser_context.agent_current_page == browser_context.human_current_page
    assert not browser_context.agent_current_page.is_closed()
@pytest.mark.asyncio
async def test_race_condition_resilience(self, browser_context, base_url):
    """Check that tab state stays consistent after a burst of rapid tab switches."""
    await self._ensure_synchronized_state(browser_context, base_url)

    # Open two extra tabs so that at least three exist in total.
    for path in ('page2', 'page3'):
        await browser_context.create_new_tab(f'{base_url}/{path}')
        await asyncio.sleep(0.5)

    session = await browser_context.get_session()
    assert len(session.context.pages) >= 3

    # Visit tab indices 0, 1, 2, 0, 1 with only a very short pause between
    # switches, to approximate racing tab operations.
    for step in range(5):
        await browser_context.switch_to_tab(step % 3)
        await asyncio.sleep(0.1)

    # After the burst, both references must be set, agree with each other,
    # and point at an open page.
    human_tab = browser_context.human_current_page
    agent_tab = browser_context.agent_current_page
    assert human_tab is not None
    assert agent_tab is not None
    assert human_tab == agent_tab
    assert not human_tab.is_closed()

    # The final tab must still be navigable.
    page = await browser_context.get_agent_current_page()
    target_url = f'{base_url}/page4'
    await page.goto(target_url)
    assert target_url in page.url
@pytest.mark.asyncio
async def test_tab_management_using_controller_actions(self, browser_context, controller, base_url):
    """
    Drive tab management through Controller actions rather than direct
    browser_context calls, and check after each step that the human and
    agent tab references are what the scenario expects.
    """
    page1_url = f'{base_url}/page1'
    page2_url = f'{base_url}/page2'
    page3_url = f'{base_url}/page3'
    page4_url = f'{base_url}/page4'

    await self._ensure_synchronized_state(browser_context, base_url)

    # Reduce the context to a single tab before starting.
    session = await browser_context.get_session()
    while len(session.context.pages) > 1:
        await browser_context.close_current_tab()
        await asyncio.sleep(0.5)

    # Remember the first tab; fall back to index 0 when it exposes no page_id.
    initial_tab = browser_context.agent_current_page
    initial_tab_id = getattr(initial_tab, 'page_id', 0)

    # One single-field action model per controller action used below.
    class OpenTabActionModel(ActionModel):
        open_tab: OpenTabAction | None = None

    class SwitchTabActionModel(ActionModel):
        switch_tab: SwitchTabAction | None = None

    class GoToUrlActionModel(ActionModel):
        go_to_url: GoToUrlAction | None = None

    class CloseTabActionModel(ActionModel):
        close_tab: CloseTabAction | None = None

    # --- Open a second tab -------------------------------------------------
    await controller.act(OpenTabActionModel(open_tab=OpenTabAction(url=page2_url)), browser_context)
    await asyncio.sleep(1)  # Wait for the tab to fully initialize

    # The new tab must be current for both the agent and the human.
    second_tab = browser_context.agent_current_page
    assert browser_context.human_current_page == browser_context.agent_current_page
    assert page2_url in browser_context.agent_current_page.url
    second_tab_id = getattr(second_tab, 'page_id', 1)

    # --- Open a third tab --------------------------------------------------
    await controller.act(OpenTabActionModel(open_tab=OpenTabAction(url=page3_url)), browser_context)
    await asyncio.sleep(1)  # Wait for the tab to fully initialize

    third_tab = browser_context.agent_current_page
    assert browser_context.human_current_page == browser_context.agent_current_page
    assert page3_url in browser_context.agent_current_page.url
    third_tab_id = getattr(third_tab, 'page_id', 2)

    # --- Agent switches back to the first tab ------------------------------
    await controller.act(
        SwitchTabActionModel(switch_tab=SwitchTabAction(page_id=initial_tab_id)),
        browser_context,
    )
    await asyncio.sleep(0.5)

    assert browser_context.agent_current_page == initial_tab
    assert page1_url in browser_context.agent_current_page.url
    assert browser_context.human_current_page == browser_context.agent_current_page

    # --- Human moves to the second tab on their own ------------------------
    await self._simulate_user_tab_change(second_tab, browser_context)
    await asyncio.sleep(0.5)

    # From here on the two roles are expected to sit on different tabs.
    assert browser_context.human_current_page == second_tab
    assert browser_context.agent_current_page == initial_tab
    assert browser_context.human_current_page != browser_context.agent_current_page
    assert page2_url in browser_context.human_current_page.url
    assert page1_url in browser_context.agent_current_page.url

    # --- Agent navigates its own tab ---------------------------------------
    await controller.act(GoToUrlActionModel(go_to_url=GoToUrlAction(url=page4_url)), browser_context)
    await asyncio.sleep(0.5)

    # Re-fetch the agent page before checking where it ended up.
    agent_page = await browser_context.get_agent_current_page()
    assert agent_page is not None
    assert page4_url in agent_page.url
    # The human's tab must not have moved.
    assert page2_url in browser_context.human_current_page.url

    # --- Close the third tab -----------------------------------------------
    await controller.act(
        CloseTabActionModel(close_tab=CloseTabAction(page_id=third_tab_id)),
        browser_context,
    )
    await asyncio.sleep(1.0)  # Extended wait to ensure tab cleanup

    session = await browser_context.get_session()
    assert len(session.context.pages) == 2

    # --- Close the second tab (the human's current tab) --------------------
    await controller.act(
        CloseTabActionModel(close_tab=CloseTabAction(page_id=second_tab_id)),
        browser_context,
    )
    await asyncio.sleep(1.0)  # Extended wait to ensure tab cleanup

    session = await browser_context.get_session()
    assert len(session.context.pages) == 1

    # With a single tab left, reconciliation should point both roles at it.
    await browser_context._reconcile_tab_state()
    assert browser_context.human_current_page is not None
    assert browser_context.agent_current_page is not None
    assert browser_context.human_current_page == browser_context.agent_current_page

    # The surviving tab must still be on the test server.
    final_page = await browser_context.get_current_page()
    assert f'{base_url}' in final_page.url

View file

@ -0,0 +1,91 @@
from browser_use.browser.context import BrowserContext, BrowserContextConfig
class TestUrlAllowlistSecurity:
    """Tests for URL allowlist security bypass prevention and URL allowlist glob pattern matching."""

    @staticmethod
    def _context_allowing(allowed_domains):
        """Build a BrowserContext without a browser, restricted to ``allowed_domains``."""
        return BrowserContext(browser=None, config=BrowserContextConfig(allowed_domains=allowed_domains))

    def test_authentication_bypass_prevention(self):
        """Credentials embedded in a URL must not smuggle a host past the allowlist."""
        context = self._context_allowing(['example.com'])

        # Each of these mentions "example.com" only in the userinfo part of
        # the URL; they must all be rejected.
        for spoofed_url in (
            'https://example.com:password@malicious.com',
            'https://example.com@malicious.com',
            'https://example.com%20@malicious.com',
            'https://example.com%3A@malicious.com',
        ):
            assert context._is_url_allowed(spoofed_url) is False

        # Genuine credentials in front of an allowed host are still accepted.
        assert context._is_url_allowed('https://user:password@example.com') is True

    def test_glob_pattern_matching(self):
        """Glob patterns in allowed_domains must match exactly the intended hosts."""
        # '*.example.com' is expected to cover subdomains of any depth and
        # the bare domain itself, but nothing else.
        glob_context = self._context_allowing(['*.example.com'])
        glob_expectations = {
            'https://sub.example.com': True,
            'https://deep.sub.example.com': True,
            'https://example.com': True,
            'https://notexample.com': False,
            'https://example.org': False,
        }
        for url, expected in glob_expectations.items():
            assert glob_context._is_url_allowed(url) is expected

        # A leading star matches any prefix, a trailing star any suffix.
        stars_context = self._context_allowing(['*google.com', 'wiki*'])
        stars_expectations = {
            # hosts ending in google.com
            'https://google.com': True,
            'https://www.google.com': True,
            'https://anygoogle.com': True,
            # hosts starting with wiki
            'https://wiki.org': True,
            'https://wikipedia.org': True,
            # unrelated host
            'https://example.com': False,
            # browser-internal URLs
            'chrome://settings': True,
            'about:blank': True,
        }
        for url, expected in stars_expectations.items():
            assert stars_context._is_url_allowed(url) is expected

        # Credential-based bypass attempts must fail for glob patterns too,
        # even though the userinfo part matches an allowed pattern.
        bypass_attempts = (
            (glob_context, 'https://allowed.example.com:password@notallowed.com'),
            (glob_context, 'https://subdomain.example.com@evil.com'),
            (glob_context, 'https://sub.example.com%20@malicious.org'),
            (stars_context, 'https://anygoogle.com@evil.org'),
        )
        for context, url in bypass_attempts:
            assert context._is_url_allowed(url) is False

    def test_glob_pattern_edge_cases(self):
        """Edge cases: patterns must anchor correctly, and broad patterns are shown to be risky."""
        stars_context = self._context_allowing(['*google.com', 'wiki*'])

        # 'wiki*' must not match hosts that merely contain "wiki", and
        # '*google.com' must not match hosts with "google" in the middle.
        for url in (
            'https://notawiki.com',
            'https://havewikipages.org',
            'https://my-wiki-site.com',
            'https://mygoogle.company.com',
        ):
            assert stars_context._is_url_allowed(url) is False

        # A pattern with wildcards on both sides matches legitimate Google
        # hosts, but equally matches a hostile domain that embeds ".google."
        # as a subdomain label. The last URL documents that risk.
        risky_context = self._context_allowing(['*.google.*'])
        for url in (
            'https://www.google.com',
            'https://mail.google.co.uk',
            'https://www.google.evil.com',
        ):
            assert risky_context._is_url_allowed(url) is True

View file

@ -0,0 +1,64 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
from pprint import pprint

import pytest

# Extend sys.path with the parent of this file's directory BEFORE importing
# browser_use. Previously the first browser_use import ran ahead of this
# line, so the path tweak could never help that import succeed.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from langchain_openai import ChatOpenAI

from browser_use import Agent, AgentHistoryList, Controller
from browser_use.browser.browser import Browser, BrowserConfig

# Module-level model and controller shared by the actions and the test below.
llm = ChatOpenAI(model='gpt-4o')
controller = Controller()
# Use this test to ask the model questions about the page, for example:
#   - which colors do you see for the bbox labels? list them all with their label
#   - which are the smallest bboxes with labels?
@controller.registry.action(description='explain what you see on the screen and ask user for input')
async def explain_screen(text: str) -> str:
    """Pretty-print ``text`` and return the line the user types in reply."""
    pprint(text)
    # input() waits on stdin, so the agent pauses here until the user answers.
    return input('\nuser input next question: \n')
@controller.registry.action(description='done')
async def done(text: str) -> str:
    """Ignore ``text`` and reply with a fixed instruction to call ``explain_screen``."""
    # NOTE(review): presumably registered so that a "done" from the model
    # loops back into explain_screen instead of ending the run; confirm
    # against the Controller's handling of this action name.
    next_instruction = 'call explain_screen'
    return next_instruction
@pytest.fixture(scope='function')
def event_loop():
    """Provide a fresh event loop from the current policy for each test, closing it afterwards."""
    policy = asyncio.get_event_loop_policy()
    fresh_loop = policy.new_event_loop()
    yield fresh_loop
    # Teardown: runs once the test using the fixture has finished.
    fresh_loop.close()
@pytest.mark.skip(reason='this is for local testing only')
async def test_vision():
    """Run the agent in a visible browser so a user can question it about the screen.

    Skipped by default (local testing only). The browser is closed in a
    ``finally`` block, whether or not the run succeeds.
    """
    # Non-headless, so the person at the keyboard can see the page.
    visible_browser = Browser(config=BrowserConfig(disable_security=True, headless=False))
    agent = Agent(
        task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to explain it and get the next question',
        llm=llm,
        controller=controller,
        browser=visible_browser,
    )
    try:
        # 20 is passed positionally to run(); presumably the step limit.
        history: AgentHistoryList = await agent.run(20)
    finally:
        # Make sure to close the browser
        await agent.browser.close()

View file

@ -0,0 +1,68 @@
import asyncio
import os
import sys
# Put the parent of this file's directory at the front of sys.path (once),
# so the imports below can resolve packages located there.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
import pytest
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
# Third-party imports
from browser_use import Agent, Controller
# Local imports
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Load environment variables via python-dotenv (presumably from a local .env
# file, so that credentials such as the OpenAI key are available to ChatOpenAI).
load_dotenv()
# Initialize the language model and controller once at import time; the test
# below passes both to the Agent.
llm = ChatOpenAI(model='gpt-4o')
controller = Controller()
@pytest.mark.skip(reason='this is for local testing only')
async def test_wait_for_element():
    """Test 'Wait for element' action.

    Opens https://pypi.org/ as an initial action, lets the agent run for at
    most three steps, and then asserts that a ``wait_for_element`` action was
    executed and that the ``#search`` element is present and visible.

    The browser context and the browser created here are always closed in a
    ``finally`` block, so a failed assertion or an agent error does not leave
    a visible browser window behind.
    """
    initial_actions = [
        {'open_tab': {'url': 'https://pypi.org/'}},
        # Uncomment the line below to include the wait action in initial actions.
        # {'wait_for_element': {'selector': '#search', 'timeout': 30}},
    ]

    # Keep a handle on the browser itself so it can be shut down afterwards.
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
    context = BrowserContext(browser=browser)

    try:
        # Create the agent with the task.
        agent = Agent(
            task="Wait for element '#search' to be visible with a timeout of 30 seconds.",
            llm=llm,
            browser_context=context,
            initial_actions=initial_actions,
            controller=controller,
        )

        # Run the agent for a few steps to trigger navigation and then the wait action.
        history = await agent.run(max_steps=3)
        action_names = history.action_names()

        # Ensure that the wait_for_element action was executed.
        assert 'wait_for_element' in action_names, 'Expected wait_for_element action to be executed.'

        # Verify that the #search element is visible by querying the page.
        page = await context.get_current_page()
        search_handle = await page.query_selector('#search')
        assert search_handle is not None, 'Expected to find a #search element on the page.'

        is_visible = await search_handle.is_visible()
        assert is_visible, 'Expected the #search element to be visible.'
    finally:
        # Nothing visible in this test shuts the context or browser down, so
        # release both here. The nested finally guarantees the browser is
        # closed even if closing the context fails.
        try:
            await context.close()
        finally:
            await browser.close()
if __name__ == '__main__':
    # Allow running this scenario directly as a script, outside of pytest.
    scenario = test_wait_for_element()
    asyncio.run(scenario)