[Add] browser-use and main.py
This commit is contained in:
parent
08e64bdf45
commit
96914d44ac
221 changed files with 30952 additions and 1 deletions
17
browser-use/docs/README.md
Normal file
17
browser-use/docs/README.md
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Docs
|
||||
|
||||
The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com).
|
||||
|
||||
### Development
|
||||
|
||||
Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview documentation changes locally. To install it, run the following command:
|
||||
|
||||
```
|
||||
npm i -g mintlify
|
||||
```
|
||||
|
||||
Run the following command at the root of your documentation (the directory that contains `mint.json`):
|
||||
|
||||
```
|
||||
mintlify dev
|
||||
```
|
||||
198
browser-use/docs/cloud/implementation.mdx
Normal file
198
browser-use/docs/cloud/implementation.mdx
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
---
|
||||
title: "Implementing the API"
|
||||
description: "Learn how to implement the Browser Use API in Python"
|
||||
icon: "code"
|
||||
---
|
||||
|
||||
This guide shows how to implement common API patterns using Python. We'll create a complete example that creates and monitors a browser automation task.
|
||||
|
||||
## Basic Implementation
|
||||
|
||||
For all settings, see [Run Task](/cloud/api-v1/run-task).
|
||||
|
||||
Here's a simple implementation using Python's `requests` library that polls the task and prints each new step as it appears:
|
||||
|
||||
```python
|
||||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
API_KEY = 'your_api_key_here'
|
||||
BASE_URL = 'https://api.browser-use.com/api/v1'
|
||||
HEADERS = {'Authorization': f'Bearer {API_KEY}'}
|
||||
|
||||
|
||||
def create_task(instructions: str):
|
||||
"""Create a new browser automation task"""
|
||||
response = requests.post(f'{BASE_URL}/run-task', headers=HEADERS, json={'task': instructions})
|
||||
return response.json()['id']
|
||||
|
||||
|
||||
def get_task_status(task_id: str):
|
||||
"""Get current task status"""
|
||||
response = requests.get(f'{BASE_URL}/task/{task_id}/status', headers=HEADERS)
|
||||
return response.json()
|
||||
|
||||
|
||||
def get_task_details(task_id: str):
|
||||
"""Get full task details including output"""
|
||||
response = requests.get(f'{BASE_URL}/task/{task_id}', headers=HEADERS)
|
||||
return response.json()
|
||||
|
||||
|
||||
def wait_for_completion(task_id: str, poll_interval: int = 2):
|
||||
"""Poll task status until completion"""
|
||||
count = 0
|
||||
unique_steps = []
|
||||
while True:
|
||||
details = get_task_details(task_id)
|
||||
new_steps = details['steps']
|
||||
# use only the new steps that are not in unique_steps.
|
||||
if new_steps != unique_steps:
|
||||
for step in new_steps:
|
||||
if step not in unique_steps:
|
||||
print(json.dumps(step, indent=4))
|
||||
unique_steps = new_steps
|
||||
count += 1
|
||||
status = details['status']
|
||||
|
||||
if status in ['finished', 'failed', 'stopped']:
|
||||
return details
|
||||
time.sleep(poll_interval)
|
||||
|
||||
|
||||
def main():
|
||||
task_id = create_task('Open https://www.google.com and search for openai')
|
||||
print(f'Task created with ID: {task_id}')
|
||||
task_details = wait_for_completion(task_id)
|
||||
print(f"Final output: {task_details['output']}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
```
|
||||
|
||||
## Task Control Example
|
||||
|
||||
Here's how to implement task control with pause/resume functionality:
|
||||
|
||||
```python
|
||||
def control_task():
|
||||
# Create a new task
|
||||
task_id = create_task("Go to google.com and search for Browser Use")
|
||||
|
||||
# Wait for 5 seconds
|
||||
time.sleep(5)
|
||||
|
||||
# Pause the task
|
||||
requests.put(f"{BASE_URL}/pause-task?task_id={task_id}", headers=HEADERS)
|
||||
print("Task paused! Check the live preview.")
|
||||
|
||||
# Wait for user input
|
||||
input("Press Enter to resume...")
|
||||
|
||||
# Resume the task
|
||||
requests.put(f"{BASE_URL}/resume-task?task_id={task_id}", headers=HEADERS)
|
||||
|
||||
# Wait for completion
|
||||
result = wait_for_completion(task_id)
|
||||
print(f"Task completed with output: {result['output']}")
|
||||
```
|
||||
|
||||
## Structured Output Example
|
||||
|
||||
Here's how to implement a task with structured JSON output:
|
||||
|
||||
```python
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import requests
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
|
||||
API_KEY = os.getenv("API_KEY")
|
||||
BASE_URL = 'https://api.browser-use.com/api/v1'
|
||||
HEADERS = {
|
||||
"Authorization": f"Bearer {API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
|
||||
# Define output schema using Pydantic
|
||||
class SocialMediaCompany(BaseModel):
|
||||
name: str
|
||||
market_cap: float
|
||||
headquarters: str
|
||||
founded_year: int
|
||||
|
||||
|
||||
class SocialMediaCompanies(BaseModel):
|
||||
companies: List[SocialMediaCompany]
|
||||
|
||||
|
||||
def create_structured_task(instructions: str, schema: dict):
|
||||
"""Create a task that expects structured output"""
|
||||
payload = {
|
||||
"task": instructions,
|
||||
"structured_output_json": json.dumps(schema)
|
||||
}
|
||||
response = requests.post(f"{BASE_URL}/run-task", headers=HEADERS, json=payload)
|
||||
response.raise_for_status()
|
||||
return response.json()["id"]
|
||||
|
||||
|
||||
def wait_for_task_completion(task_id: str, poll_interval: int = 5):
|
||||
"""Poll task status until it completes"""
|
||||
while True:
|
||||
response = requests.get(f"{BASE_URL}/task/{task_id}/status", headers=HEADERS)
|
||||
response.raise_for_status()
|
||||
status = response.json()
|
||||
if status == "finished":
|
||||
break
|
||||
elif status in ["failed", "stopped"]:
|
||||
raise RuntimeError(f"Task {task_id} ended with status: {status}")
|
||||
print("Waiting for task to finish...")
|
||||
time.sleep(poll_interval)
|
||||
|
||||
|
||||
def fetch_task_output(task_id: str):
|
||||
"""Retrieve the final task result"""
|
||||
response = requests.get(f"{BASE_URL}/task/{task_id}", headers=HEADERS)
|
||||
response.raise_for_status()
|
||||
return response.json()["output"]
|
||||
|
||||
|
||||
def main():
|
||||
schema = SocialMediaCompanies.model_json_schema()
|
||||
task_id = create_structured_task(
|
||||
"Get me the top social media companies by market cap",
|
||||
schema
|
||||
)
|
||||
print(f"Task created with ID: {task_id}")
|
||||
|
||||
wait_for_task_completion(task_id)
|
||||
print("Task completed!")
|
||||
|
||||
output = fetch_task_output(task_id)
|
||||
print("Raw output:", output)
|
||||
|
||||
try:
|
||||
parsed = SocialMediaCompanies.model_validate_json(output)
|
||||
print("Parsed output:")
|
||||
print(parsed)
|
||||
except Exception as e:
|
||||
print(f"Failed to parse structured output: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
<Note>
|
||||
Remember to handle your API key securely and implement proper error handling
|
||||
in production code.
|
||||
</Note>
|
||||
124
browser-use/docs/cloud/quickstart.mdx
Normal file
124
browser-use/docs/cloud/quickstart.mdx
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
---
|
||||
title: "Quickstart"
|
||||
description: "Learn how to get started with the Browser Use Cloud API"
|
||||
icon: "cloud"
|
||||
---
|
||||
|
||||
The Browser Use Cloud API lets you create and manage browser automation agents programmatically. Each agent can execute tasks and provide real-time feedback through a live preview URL.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
<Note>
|
||||
You need an active subscription and an API key from
|
||||
[cloud.browser-use.com/billing](https://cloud.browser-use.com/billing)
|
||||
</Note>
|
||||
|
||||
## Pricing
|
||||
|
||||
The Browser Use Cloud API pricing consists of two components:
|
||||
|
||||
1. **Task Initialization Cost**: $0.01 per started task
|
||||
2. **Task Step Cost**: Additional cost based on the specific model used for each step
|
||||
|
||||
### LLM Model Step Pricing
|
||||
|
||||
The following table shows the total cost per step for each available LLM model:
|
||||
|
||||
| Model | Cost per Step |
|
||||
| ------------------------------ | ------------- |
|
||||
| GPT-4o | $0.03 |
|
||||
| GPT-4.1 | $0.03 |
|
||||
| Claude 3.7 Sonnet (2025-02-19) | $0.03 |
|
||||
| GPT-4o mini | $0.01 |
|
||||
| GPT-4.1 mini | $0.01 |
|
||||
| Gemini 2.0 Flash | $0.01 |
|
||||
| Gemini 2.0 Flash Lite | $0.01 |
|
||||
| Llama 4 Maverick | $0.01 |
|
||||
|
||||
### Example Cost Calculation
|
||||
|
||||
For example, using GPT-4o for a 10 step task:
|
||||
|
||||
- Task initialization: $0.01
|
||||
- 10 steps × $0.03 per step = $0.30
|
||||
- Total cost: $0.31
|
||||
|
||||
## Creating Your First Agent
|
||||
|
||||
Create a new browser automation task by providing instructions in natural language:
|
||||
|
||||
```bash
|
||||
curl -X POST https://api.browser-use.com/api/v1/run-task \
|
||||
-H "Authorization: Bearer your_api_key_here" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"task": "Go to google.com and search for Browser Use"
|
||||
}'
|
||||
```
|
||||
|
||||
The API returns a task ID that you can use to manage the task and check the live preview URL.
|
||||
|
||||
<Note>
|
||||
The task response includes a `live_url` that you can embed in an iframe to
|
||||
watch and control the agent in real-time.
|
||||
</Note>
|
||||
|
||||
## Managing Tasks
|
||||
|
||||
Control running tasks with these operations:
|
||||
|
||||
<AccordionGroup>
|
||||
<Accordion title="Pause/Resume Tasks">
|
||||
Temporarily pause task execution with [`/api/v1/pause-task`](/cloud/api-v1/pause-task) and resume with
|
||||
[`/api/v1/resume-task`](/cloud/api-v1/resume-task). Useful for manual inspection or intervention.
|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Stop Tasks">
|
||||
Permanently stop a task using [`/api/v1/stop-task`](/cloud/api-v1/stop-task). The task cannot be
|
||||
resumed after being stopped.
|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
For detailed API documentation, see the tabs on the left, which cover the full API.
|
||||
|
||||
## Building your own client (OpenAPI)
|
||||
|
||||
<Note>
|
||||
We recommend this only if you don't need control and only need to run simple
|
||||
tasks.
|
||||
</Note>
|
||||
|
||||
The best way to build your own client is to use our [OpenAPI specification](http://api.browser-use.com/openapi.json) to generate a type-safe client library.
|
||||
|
||||
### Python
|
||||
|
||||
Use [openapi-python-client](https://github.com/openapi-generators/openapi-python-client) to generate a modern Python client:
|
||||
|
||||
```bash
|
||||
# Install the generator
|
||||
pipx install openapi-python-client --include-deps
|
||||
|
||||
# Generate the client
|
||||
openapi-python-client generate --url http://api.browser-use.com/openapi.json
|
||||
```
|
||||
|
||||
This will create a Python package with full type hints, modern dataclasses, and async support.
|
||||
|
||||
### TypeScript/JavaScript
|
||||
|
||||
For TypeScript projects, use [openapi-typescript](https://www.npmjs.com/package/openapi-typescript) to generate type definitions:
|
||||
|
||||
```bash
|
||||
# Install the generator
|
||||
npm install -D openapi-typescript
|
||||
|
||||
# Generate the types
|
||||
npx openapi-typescript http://api.browser-use.com/openapi.json -o browser-use-api.ts
|
||||
```
|
||||
|
||||
This will create TypeScript definitions you can use with your preferred HTTP client.
|
||||
|
||||
<Note>
|
||||
Need help? Contact our support team at support@browser-use.com or join our
|
||||
[Discord community](https://link.browser-use.com/discord)
|
||||
</Note>
|
||||
334
browser-use/docs/customize/agent-settings.mdx
Normal file
334
browser-use/docs/customize/agent-settings.mdx
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
---
|
||||
title: "Agent Settings"
|
||||
description: "Learn how to configure the agent"
|
||||
icon: "gear"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The `Agent` class is the core component of Browser Use that handles browser automation. Here are the main configuration options you can use when initializing an agent.
|
||||
|
||||
## Basic Settings
|
||||
|
||||
```python
|
||||
from browser_use import Agent
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
agent = Agent(
|
||||
task="Search for latest news about AI",
|
||||
llm=ChatOpenAI(model="gpt-4o"),
|
||||
)
|
||||
```
|
||||
|
||||
### Required Parameters
|
||||
|
||||
- `task`: The instruction for the agent to execute
|
||||
- `llm`: A LangChain chat model instance. See <a href="/customize/supported-models">LangChain Models</a> for supported models.
|
||||
|
||||
## Agent Behavior
|
||||
|
||||
Control how the agent operates:
|
||||
|
||||
```python
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
llm=llm,
|
||||
controller=custom_controller, # For custom tool calling
|
||||
use_vision=True, # Enable vision capabilities
|
||||
save_conversation_path="logs/conversation" # Save chat logs
|
||||
)
|
||||
```
|
||||
|
||||
### Behavior Parameters
|
||||
|
||||
- `controller`: Registry of functions the agent can call. Defaults to base Controller. See <a href="/customize/custom-functions">Custom Functions</a> for details.
|
||||
- `use_vision`: Enable/disable vision capabilities. Defaults to `True`.
|
||||
- When enabled, the model processes visual information from web pages
|
||||
- Disable to reduce costs or use models without vision support
|
||||
- For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size)
|
||||
- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging.
|
||||
- `override_system_message`: Completely replace the default system prompt with a custom one.
|
||||
- `extend_system_message`: Add additional instructions to the default system prompt.
|
||||
|
||||
<Note>
|
||||
Vision capabilities are recommended for better web interaction understanding,
|
||||
but can be disabled to reduce costs or when using models without vision
|
||||
support.
|
||||
</Note>
|
||||
|
||||
## (Reuse) Browser Configuration
|
||||
|
||||
You can configure how the agent interacts with the browser. To see more `Browser` options refer to the <a href="/customize/browser-settings">Browser Settings</a> documentation.
|
||||
|
||||
### Reuse Existing Browser
|
||||
|
||||
`browser`: A Browser Use Browser instance. When provided, the agent will reuse this browser instance and automatically create new contexts for each `run()`.
|
||||
|
||||
```python
|
||||
from browser_use import Agent, Browser
|
||||
from browser_use.browser.context import BrowserContext
|
||||
|
||||
# Reuse existing browser
|
||||
browser = Browser()
|
||||
agent = Agent(
|
||||
task=task1,
|
||||
llm=llm,
|
||||
browser=browser # Browser instance will be reused
|
||||
)
|
||||
|
||||
await agent.run()
|
||||
|
||||
# Manually close the browser
|
||||
await browser.close()
|
||||
```
|
||||
|
||||
<Note>
|
||||
Remember: in this scenario the `Browser` will not be closed automatically.
|
||||
</Note>
|
||||
|
||||
### Reuse Existing Browser Context
|
||||
|
||||
`browser_context`: A Playwright browser context. Useful for maintaining persistent sessions. See <a href="/customize/persistent-browser">Persistent Browser</a> for more details.
|
||||
|
||||
```python
|
||||
from browser_use import Agent, Browser
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
# Use specific browser context (preferred method)
|
||||
async with await browser.new_context() as context:
|
||||
agent = Agent(
|
||||
task=task2,
|
||||
llm=llm,
|
||||
browser_context=context # Use persistent context
|
||||
)
|
||||
|
||||
# Run the agent
|
||||
await agent.run()
|
||||
|
||||
# Pass the context to the next agent
|
||||
next_agent = Agent(
|
||||
task=task2,
|
||||
llm=llm,
|
||||
browser_context=context
|
||||
)
|
||||
|
||||
...
|
||||
|
||||
await browser.close()
|
||||
```
|
||||
|
||||
For more information about how browser context works, refer to the [Playwright
|
||||
documentation](https://playwright.dev/docs/api/class-browsercontext).
|
||||
|
||||
<Note>
|
||||
You can reuse the same context for multiple agents. If you do nothing, the
|
||||
browser will be automatically created and closed on `run()` completion.
|
||||
</Note>
|
||||
|
||||
## Running the Agent
|
||||
|
||||
The agent is executed using the async `run()` method:
|
||||
|
||||
- `max_steps` (default: `100`)
|
||||
Maximum number of steps the agent can take during execution. This prevents infinite loops and helps control execution time.
|
||||
|
||||
## Agent History
|
||||
|
||||
The method returns an `AgentHistoryList` object containing the complete execution history. This history is invaluable for debugging, analysis, and creating reproducible scripts.
|
||||
|
||||
```python
|
||||
# Example of accessing history
|
||||
history = await agent.run()
|
||||
|
||||
# Access (some) useful information
|
||||
history.urls() # List of visited URLs
|
||||
history.screenshots() # List of screenshot paths
|
||||
history.action_names() # Names of executed actions
|
||||
history.extracted_content() # Content extracted during execution
|
||||
history.errors() # Any errors that occurred
|
||||
history.model_actions() # All actions with their parameters
|
||||
```
|
||||
|
||||
The `AgentHistoryList` provides many helper methods to analyze the execution:
|
||||
|
||||
- `final_result()`: Get the final extracted content
|
||||
- `is_done()`: Check if the agent completed successfully
|
||||
- `has_errors()`: Check if any errors occurred
|
||||
- `model_thoughts()`: Get the agent's reasoning process
|
||||
- `action_results()`: Get results of all actions
|
||||
|
||||
<Note>
|
||||
For a complete list of helper methods and detailed history analysis
|
||||
capabilities, refer to the [AgentHistoryList source
|
||||
code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L111).
|
||||
</Note>
|
||||
|
||||
## Run initial actions without LLM
|
||||
With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py) you can run initial actions without the LLM.
|
||||
Specify the action as a dictionary where the key is the action name and the value is the action parameters. You can find all our actions in the [Controller](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py) source code.
|
||||
```python
|
||||
|
||||
initial_actions = [
|
||||
{'open_tab': {'url': 'https://www.google.com'}},
|
||||
{'open_tab': {'url': 'https://en.wikipedia.org/wiki/Randomness'}},
|
||||
{'scroll_down': {'amount': 1000}},
|
||||
]
|
||||
agent = Agent(
|
||||
task='What theories are displayed on the page?',
|
||||
initial_actions=initial_actions,
|
||||
llm=llm,
|
||||
)
|
||||
```
|
||||
|
||||
## Run with message context
|
||||
|
||||
You can configure the agent and provide a separate message to help the LLM understand the task better.
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
message_context="Additional information about the task",
|
||||
llm = ChatOpenAI(model='gpt-4o')
|
||||
)
|
||||
```
|
||||
|
||||
## Run with planner model
|
||||
|
||||
You can configure the agent to use a separate planner model for high-level task planning:
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
# Initialize models
|
||||
llm = ChatOpenAI(model='gpt-4o')
|
||||
planner_llm = ChatOpenAI(model='o3-mini')
|
||||
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
llm=llm,
|
||||
planner_llm=planner_llm, # Separate model for planning
|
||||
use_vision_for_planner=False, # Disable vision for planner
|
||||
planner_interval=4 # Plan every 4 steps
|
||||
)
|
||||
```
|
||||
|
||||
### Planner Parameters
|
||||
|
||||
- `planner_llm`: A LangChain chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
|
||||
- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
|
||||
- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
|
||||
|
||||
Using a separate planner model can help:
|
||||
- Reduce costs by using a smaller model for high-level planning
|
||||
- Improve task decomposition and strategic thinking
|
||||
- Better handle complex, multi-step tasks
|
||||
|
||||
<Note>
|
||||
The planner model is optional. If `planner_llm` is not specified, the agent runs without a separate planning step.
|
||||
</Note>
|
||||
|
||||
### Optional Parameters
|
||||
|
||||
- `message_context`: Additional information about the task to help the LLM understand the task better.
|
||||
- `initial_actions`: List of initial actions to run before the main task.
|
||||
- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`.
|
||||
- `max_failures`: Maximum number of failures before giving up. Defaults to `3`.
|
||||
- `retry_delay`: Time to wait between retries in seconds when rate limited. Defaults to `10`.
|
||||
- `generate_gif`: Enable/disable GIF generation. Defaults to `False`. Set to `True` or a string path to save the GIF.
|
||||
## Memory Management
|
||||
|
||||
Browser Use includes a procedural memory system using [Mem0](https://mem0.ai) that automatically summarizes the agent's conversation history at regular intervals to optimize context window usage during long tasks.
|
||||
|
||||
```python
|
||||
from browser_use.agent.memory import MemoryConfig
|
||||
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
llm=llm,
|
||||
enable_memory=True,
|
||||
memory_config=MemoryConfig(
|
||||
agent_id="my_custom_agent",
|
||||
memory_interval=15
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Memory Parameters
|
||||
|
||||
- `enable_memory`: Enable/disable the procedural memory system. Defaults to `True`.
|
||||
- `memory_config`: A `MemoryConfig` Pydantic model instance (required). Dictionary format is not supported.
|
||||
|
||||
### Using MemoryConfig
|
||||
|
||||
You must configure the memory system using the `MemoryConfig` Pydantic model for a type-safe approach:
|
||||
|
||||
```python
|
||||
from browser_use.agent.memory import MemoryConfig
|
||||
|
||||
agent = Agent(
|
||||
task=task_description,
|
||||
llm=llm,
|
||||
memory_config=MemoryConfig(
|
||||
agent_id="my_agent",
|
||||
memory_interval=15,
|
||||
embedder_provider="openai",
|
||||
embedder_model="text-embedding-3-large",
|
||||
embedder_dims=1536,
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
The `MemoryConfig` model provides these configuration options:
|
||||
|
||||
#### Memory Settings
|
||||
- `agent_id`: Unique identifier for the agent (default: `"browser_use_agent"`)
|
||||
- `memory_interval`: Number of steps between memory summarization (default: `10`)
|
||||
|
||||
#### Embedder Settings
|
||||
- `embedder_provider`: Provider for embeddings (`'openai'`, `'gemini'`, `'ollama'`, or `'huggingface'`)
|
||||
- `embedder_model`: Model name for the embedder
|
||||
- `embedder_dims`: Dimensions for the embeddings
|
||||
|
||||
#### Vector Store Settings
|
||||
- `vector_store_provider`: Provider for vector storage (currently only `'faiss'` is supported)
|
||||
- `vector_store_base_path`: Path for storing vector data (e.g. /tmp/mem0)
|
||||
|
||||
The model automatically sets appropriate defaults based on the LLM being used:
|
||||
- For `ChatOpenAI`: Uses OpenAI's `text-embedding-3-small` embeddings
|
||||
- For `ChatGoogleGenerativeAI`: Uses Gemini's `models/text-embedding-004` embeddings
|
||||
- For `ChatOllama`: Uses Ollama's `nomic-embed-text` embeddings
|
||||
- Default: Uses Hugging Face's `all-MiniLM-L6-v2` embeddings
|
||||
|
||||
<Note>
|
||||
Always pass a properly constructed `MemoryConfig` object to the `memory_config` parameter.
|
||||
Dictionary-based configuration is no longer supported.
|
||||
</Note>
|
||||
|
||||
### How Memory Works
|
||||
|
||||
When enabled, the agent periodically compresses its conversation history into concise summaries:
|
||||
|
||||
1. Every `memory_interval` steps, the agent reviews its recent interactions
|
||||
2. It creates a procedural memory summary using the same LLM as the agent
|
||||
3. The original messages are replaced with the summary, reducing token usage
|
||||
4. This process helps maintain important context while freeing up the context window
|
||||
|
||||
### Disabling Memory
|
||||
|
||||
If you want to disable the memory system (for debugging or for shorter tasks), set `enable_memory` to `False`:
|
||||
|
||||
```python
|
||||
agent = Agent(
|
||||
task="your task",
|
||||
llm=llm,
|
||||
enable_memory=False
|
||||
)
|
||||
```
|
||||
|
||||
<Note>
|
||||
Disabling memory may be useful for debugging or short tasks, but for longer
|
||||
tasks, it can lead to context window overflow as the conversation history
|
||||
grows. The memory system helps maintain performance during extended sessions.
|
||||
</Note>
|
||||
202
browser-use/docs/customize/browser-settings.mdx
Normal file
202
browser-use/docs/customize/browser-settings.mdx
Normal file
|
|
@ -0,0 +1,202 @@
|
|||
---
|
||||
title: "Browser Settings"
|
||||
description: "Configure browser behavior and context settings"
|
||||
icon: "globe"
|
||||
---
|
||||
|
||||
Browser Use allows you to customize the browser's behavior through two main configuration classes: `BrowserConfig` and `BrowserContextConfig`. These settings control everything from headless mode to proxy settings and page load behavior.
|
||||
|
||||
<Note>
|
||||
We are currently working on improving how browser contexts are managed. The
|
||||
system will soon transition to a "1 agent, 1 browser, 1 context" model for
|
||||
better stability and developer experience.
|
||||
</Note>
|
||||
|
||||
# Browser Configuration
|
||||
|
||||
The `BrowserConfig` class controls the core browser behavior and connection settings.
|
||||
|
||||
```python
|
||||
from browser_use import BrowserConfig
|
||||
|
||||
# Basic configuration
|
||||
config = BrowserConfig(
|
||||
headless=False,
|
||||
disable_security=False
|
||||
)
|
||||
|
||||
browser = Browser(config=config)
|
||||
|
||||
agent = Agent(
|
||||
browser=browser,
|
||||
# ...
|
||||
)
|
||||
```
|
||||
|
||||
## Core Settings
|
||||
|
||||
- **headless** (default: `False`)
|
||||
Runs the browser without a visible UI. Note that some websites may detect headless mode.
|
||||
|
||||
- **disable_security** (default: `False`)
|
||||
Disables browser security features. While this can fix certain functionality issues (like cross-site iFrames), it should be used cautiously, especially when visiting untrusted websites.
|
||||
|
||||
- **keep_alive** (default: `False`)
|
||||
Keeps the browser alive after the agent has finished running. This is useful when you need to run multiple tasks with the same browser instance.
|
||||
|
||||
### Additional Settings
|
||||
|
||||
- **extra_browser_args** (default: `[]`)
|
||||
Additional arguments passed to the browser at launch. See the [full list of available arguments](https://github.com/browser-use/browser-use/blob/main/browser_use/browser/browser.py#L180).
|
||||
|
||||
- **proxy** (default: `None`)
|
||||
Standard Playwright proxy settings for using external proxy services.
|
||||
|
||||
- **new_context_config** (default: `BrowserContextConfig()`)
|
||||
Default settings for new browser contexts. See Context Configuration below.
|
||||
|
||||
<Note>
|
||||
For web scraping tasks on sites that restrict automated access, we recommend
|
||||
using external browser or proxy providers for better reliability.
|
||||
</Note>
|
||||
|
||||
## Alternative Initialization
|
||||
|
||||
These settings allow you to connect to external browser providers or use a local Chrome instance.
|
||||
|
||||
### External Browser Provider (wss)
|
||||
|
||||
Connect to cloud-based browser services for enhanced reliability and proxy capabilities.
|
||||
|
||||
```python
|
||||
config = BrowserConfig(
|
||||
wss_url="wss://your-browser-provider.com/ws"
|
||||
)
|
||||
```
|
||||
|
||||
- **wss_url** (default: `None`)
|
||||
WebSocket URL for connecting to external browser providers (e.g., [anchorbrowser.io](https://anchorbrowser.io), steel.dev, browserbase.com, browserless.io, [TestingBot](https://testingbot.com/support/ai/integrations/browser-use)).
|
||||
|
||||
<Note>
|
||||
This overrides local browser settings and uses the provider's configuration.
|
||||
Refer to their documentation for settings.
|
||||
</Note>
|
||||
|
||||
### External Browser Provider (cdp)
|
||||
|
||||
Connect to cloud or local Chrome instances using Chrome DevTools Protocol (CDP) for use with tools like `headless-shell` or `browserless`.
|
||||
|
||||
```python
|
||||
config = BrowserConfig(
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
```
|
||||
|
||||
- **cdp_url** (default: `None`)
|
||||
URL for connecting to a Chrome instance via CDP. Commonly used for debugging or connecting to locally running Chrome instances.
|
||||
|
||||
### Local Chrome Instance (binary)
|
||||
|
||||
Connect to your existing Chrome installation to access saved states and cookies.
|
||||
|
||||
```python
|
||||
config = BrowserConfig(
|
||||
browser_binary_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
||||
)
|
||||
```
|
||||
|
||||
- **browser_binary_path** (default: `None`)
|
||||
Path to an existing browser installation to launch and connect to. Particularly useful for workflows requiring existing login states or browser preferences.
|
||||
|
||||
<Note>This will overwrite other browser settings.</Note>
|
||||
|
||||
# Context Configuration
|
||||
|
||||
The `BrowserContextConfig` class controls settings for individual browser contexts.
|
||||
|
||||
```python
|
||||
from browser_use.browser.context import BrowserContextConfig
|
||||
|
||||
config = BrowserContextConfig(
|
||||
cookies_file="path/to/cookies.json",
|
||||
wait_for_network_idle_page_load_time=3.0,
|
||||
window_width=1280,
|
||||
window_height=1100,
|
||||
locale='en-US',
|
||||
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
|
||||
highlight_elements=True,
|
||||
viewport_expansion=500,
|
||||
allowed_domains=['google.com', 'wikipedia.org'],
|
||||
)
|
||||
|
||||
browser = Browser()
|
||||
context = BrowserContext(browser=browser, config=config)
|
||||
|
||||
|
||||
async def run_search():
|
||||
agent = Agent(
|
||||
browser_context=context,
|
||||
task='Your task',
|
||||
llm=llm)
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Page Load Settings
|
||||
|
||||
- **minimum_wait_page_load_time** (default: `0.5`)
|
||||
Minimum time to wait before capturing page state for LLM input.
|
||||
|
||||
- **wait_for_network_idle_page_load_time** (default: `1.0`)
|
||||
Time to wait for network activity to cease. Increase to 3-5s for slower websites. This tracks essential content loading, not dynamic elements like videos.
|
||||
|
||||
- **maximum_wait_page_load_time** (default: `5.0`)
|
||||
Maximum time to wait for page load before proceeding.
|
||||
|
||||
### Display Settings
|
||||
|
||||
- **window_width** (default: `1280`) and **window_height** (default: `1100`)
|
||||
Browser window dimensions. The default size is optimized for general use cases and interaction with common UI elements like cookie banners.
|
||||
|
||||
- **locale** (default: `None`)
|
||||
Specify user locale, for example en-GB, de-DE, etc. Locale will affect the `navigator.language` value, the Accept-Language request header value, as well as number and date formatting rules. If not provided, defaults to the system default locale.
|
||||
|
||||
- **highlight_elements** (default: `True`)
|
||||
Highlight interactive elements on the screen with colorful bounding boxes.
|
||||
|
||||
- **viewport_expansion** (default: `500`)
|
||||
Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. Setting this parameter controls the highlighting of elements:
|
||||
- `-1`: All elements from the entire page will be included, regardless of visibility (highest token usage but most complete).
|
||||
- `0`: Only elements which are currently visible in the viewport will be included.
|
||||
- `500` (default): Elements in the viewport plus an additional 500 pixels in each direction will be included, providing a balance between context and token usage.
|
||||
|
||||
### Restrict URLs
|
||||
|
||||
- **allowed_domains** (default: `None`)
|
||||
List of allowed domains that the agent can access. If None, all domains are allowed.
|
||||
Example: ['google.com', '*.wikipedia.org'] - Here the agent will only be able to access `google.com` exactly and `wikipedia.org` + `*.wikipedia.org`.
|
||||
|
||||
Glob patterns are supported:
|
||||
- `['example.com']` ✅ will match only `example.com` exactly, subdomains will not be allowed.
|
||||
It is always most secure to explicitly list all the domains you want to give access to, e.g.
|
||||
`['google.com', 'www.google.com', 'myaccount.google.com', 'mail.google.com', 'docs.google.com']`
|
||||
- `['*.example.com']` ⚠️ **CAUTION** this will match `example.com` and *all* subdomains.
|
||||
Make sure *all* the subdomains are safe for the agent! `abc.example.com`, `def.example.com`, ..., `useruploads.example.com`, `admin.example.com`
|
||||
- `['*google.com']` ❌ **DON'T DO THIS**, it will match any domains that end in `google.com`, *including `evilgoogle.com`*
|
||||
- `['*.google.*']` ❌ **DON'T DO THIS**, it will match `google.com`, `google.co.uk`, `google.fr`, etc. *but also `www.google.evil.com`*
|
||||
|
||||
### Session Management
|
||||
|
||||
- **keep_alive** (default: `False`)
|
||||
Keeps the browser context (tab/session) alive after an agent task has completed. This is useful for maintaining session state across multiple tasks.
|
||||
|
||||
### Debug and Recording
|
||||
|
||||
- **save_recording_path** (default: `None`)
|
||||
Directory path for saving video recordings.
|
||||
|
||||
- **trace_path** (default: `None`)
|
||||
Directory path for saving trace files. Files are automatically named as `{trace_path}/{context_id}.zip`.
|
||||
|
||||
- **save_playwright_script_path** (default: `None`)
|
||||
BETA: Filename to save a replayable playwright python script to containing the steps the agent took.
|
||||
133
browser-use/docs/customize/custom-functions.mdx
Normal file
133
browser-use/docs/customize/custom-functions.mdx
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
---
|
||||
title: "Custom Functions"
|
||||
description: "Extend default agent and write custom function calls"
|
||||
icon: "function"
|
||||
---
|
||||
|
||||
## Basic Function Registration
|
||||
|
||||
Functions can be either `sync` or `async`. Keep them focused and single-purpose.
|
||||
|
||||
```python
|
||||
from browser_use import Controller, ActionResult
|
||||
# Initialize the controller
|
||||
controller = Controller()
|
||||
|
||||
@controller.action('Ask user for information')
|
||||
def ask_human(question: str) -> str:
|
||||
answer = input(f'\n{question}\nInput: ')
|
||||
return ActionResult(extracted_content=answer)
|
||||
```
|
||||
|
||||
<Note>
|
||||
Basic `Controller` has all basic functionality you might need to interact with
|
||||
the browser already implemented.
|
||||
</Note>
|
||||
|
||||
```python
|
||||
# ... then pass controller to the agent
|
||||
agent = Agent(
|
||||
task=task,
|
||||
llm=llm,
|
||||
controller=controller
|
||||
)
|
||||
```
|
||||
|
||||
<Note>
|
||||
Keep the function name and description short and concise. The Agent uses the
|
||||
function solely based on the name and description. The stringified output of
|
||||
the action is passed to the Agent.
|
||||
</Note>
|
||||
|
||||
## Browser-Aware Functions
|
||||
|
||||
For actions that need browser access, simply add the `browser` parameter inside the function parameters:
|
||||
|
||||
<Note>
|
||||
Please note that browser-use’s `Browser` class is a wrapper class around
|
||||
Playwright’s `Browser`. The `Browser.playwright_browser` attr can be used
|
||||
to directly access the Playwright browser object if needed.
|
||||
</Note>
|
||||
|
||||
```python
|
||||
from browser_use import Browser, Controller, ActionResult
|
||||
|
||||
controller = Controller()
|
||||
@controller.action('Open website')
|
||||
async def open_website(url: str, browser: Browser):
|
||||
page = await browser.get_current_page()
|
||||
await page.goto(url)
|
||||
return ActionResult(extracted_content='Website opened')
|
||||
```
|
||||
|
||||
## Structured Parameters with Pydantic
|
||||
|
||||
For complex actions, you can define parameter schemas using Pydantic models:
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from browser_use import Controller, ActionResult, Browser
|
||||
|
||||
controller = Controller()
|
||||
|
||||
class JobDetails(BaseModel):
|
||||
title: str
|
||||
company: str
|
||||
job_link: str
|
||||
salary: Optional[str] = None
|
||||
|
||||
@controller.action(
|
||||
'Save job details which you found on page',
|
||||
param_model=JobDetails
|
||||
)
|
||||
async def save_job(params: JobDetails, browser: Browser):
|
||||
print(f"Saving job: {params.title} at {params.company}")
|
||||
|
||||
# Access browser if needed
|
||||
page = browser.get_current_page()
|
||||
await page.goto(params.job_link)
|
||||
```
|
||||
|
||||
## Using Custom Actions with multiple agents
|
||||
|
||||
You can use the same controller for multiple agents.
|
||||
|
||||
```python
|
||||
controller = Controller()
|
||||
|
||||
# ... register actions to the controller
|
||||
|
||||
agent = Agent(
|
||||
task="Go to website X and find the latest news",
|
||||
llm=llm,
|
||||
controller=controller
|
||||
)
|
||||
|
||||
# Run the agent
|
||||
await agent.run()
|
||||
|
||||
agent2 = Agent(
|
||||
task="Go to website Y and find the latest news",
|
||||
llm=llm,
|
||||
controller=controller
|
||||
)
|
||||
|
||||
await agent2.run()
|
||||
```
|
||||
|
||||
<Note>
|
||||
The controller is stateless: it can register multiple actions and be shared by
|
||||
multiple agents.
|
||||
</Note>
|
||||
|
||||
|
||||
|
||||
## Exclude functions
|
||||
If you want fewer actions to be used by the agent, you can exclude them from the controller.
|
||||
```python
|
||||
controller = Controller(exclude_actions=['open_tab', 'search_google'])
|
||||
```
|
||||
|
||||
|
||||
For more examples like file upload or notifications, visit [examples/custom-functions](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions).
|
||||
365
browser-use/docs/customize/hooks.mdx
Normal file
365
browser-use/docs/customize/hooks.mdx
Normal file
|
|
@ -0,0 +1,365 @@
|
|||
---
|
||||
title: "Lifecycle Hooks"
|
||||
description: "Customize agent behavior with lifecycle hooks"
|
||||
icon: "Wrench"
|
||||
author: "Carlos A. Planchón"
|
||||
---
|
||||
|
||||
# Using Agent Lifecycle Hooks
|
||||
|
||||
Browser-Use provides lifecycle hooks that allow you to execute custom code at specific points during the agent's execution. These hooks enable you to capture detailed information about the agent's actions, modify behavior, or integrate with external systems.
|
||||
|
||||
## Available Hooks
|
||||
|
||||
Currently, Browser-Use provides the following hooks:
|
||||
|
||||
| Hook | Description | When it's called |
|
||||
| ---- | ----------- | ---------------- |
|
||||
| `on_step_start` | Executed at the beginning of each agent step | Before the agent processes the current state and decides on the next action |
|
||||
| `on_step_end` | Executed at the end of each agent step | After the agent has executed the action for the current step |
|
||||
|
||||
## Using Hooks
|
||||
|
||||
Hooks are passed as parameters to the `agent.run()` method. Each hook should be a callable function that accepts the agent instance as its parameter.
|
||||
|
||||
### Basic Example
|
||||
|
||||
```python
|
||||
from browser_use import Agent
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
|
||||
async def my_step_hook(agent):
|
||||
# inside a hook you can access all the state and methods under the Agent object:
|
||||
# agent.settings, agent.state, agent.task
|
||||
# agent.controller, agent.llm, agent.browser, agent.browser_context
|
||||
# agent.pause(), agent.resume(), agent.add_new_task(...), etc.
|
||||
|
||||
# You also have direct access to the playwright Page and Browser Context
|
||||
page = await agent.browser_context.get_current_page()
|
||||
# https://playwright.dev/python/docs/api/class-page
|
||||
|
||||
current_url = page.url
|
||||
visit_log = agent.state.history.urls()
|
||||
previous_url = visit_log[-2] if len(visit_log) >= 2 else None
|
||||
print(f"Agent was last on URL: {previous_url} and is now on {current_url}")
|
||||
|
||||
# Example: listen for events on the page, interact with the DOM, run JS directly, etc.
|
||||
await page.on('domcontentloaded', lambda: print('page navigated to a new url...'))
|
||||
await page.locator("css=form > input[type=submit]").click()
|
||||
await page.evaluate('() => alert(1)')
|
||||
await page.browser.new_tab
|
||||
await agent.browser_context.session.context.add_init_script('/* some JS to run on every page */')
|
||||
|
||||
# Example: monitor or intercept all network requests
|
||||
async def handle_request(route):
|
||||
# Print, modify, block, etc. do anything to the requests here
|
||||
# https://playwright.dev/python/docs/network#handle-requests
|
||||
print(route.request, route.request.headers)
|
||||
await route.continue_(headers=route.request.headers)
|
||||
await page.route("**/*", handle_route)
|
||||
|
||||
# Example: pause agent execution and resume it based on some custom code
|
||||
if '/completed' in current_url:
|
||||
agent.pause()
|
||||
Path('result.txt').write_text(await page.content())
|
||||
input('Saved "completed" page content to result.txt, press [Enter] to resume...')
|
||||
agent.resume()
|
||||
|
||||
agent = Agent(
|
||||
task="Search for the latest news about AI",
|
||||
llm=ChatOpenAI(model="gpt-4o"),
|
||||
)
|
||||
|
||||
await agent.run(
|
||||
on_step_start=my_step_hook,
|
||||
# on_step_end=...
|
||||
max_steps=10
|
||||
)
|
||||
```
|
||||
|
||||
## Complete Example: Agent Activity Recording System
|
||||
|
||||
This comprehensive example demonstrates a complete implementation for recording and saving Browser-Use agent activity, consisting of both server and client components.
|
||||
|
||||
### Setup Instructions
|
||||
|
||||
To use this example, you'll need to:
|
||||
|
||||
1. Set up the required dependencies:
|
||||
```bash
|
||||
pip install fastapi uvicorn prettyprinter pyobjtojson python-dotenv browser-use langchain-openai
|
||||
```
|
||||
|
||||
2. Create two separate Python files:
|
||||
- `api.py` - The FastAPI server component
|
||||
- `client.py` - The Browser-Use agent with recording hook
|
||||
|
||||
3. Run both components:
|
||||
- Start the API server first: `python api.py`
|
||||
- Then run the client: `python client.py`
|
||||
|
||||
### Server Component (api.py)
|
||||
|
||||
The server component handles receiving and storing the agent's activity data:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#
|
||||
# FastAPI API to record and save Browser-Use activity data.
|
||||
# Save this code to api.py and run with `python api.py`
|
||||
#
|
||||
|
||||
import json
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
import prettyprinter
|
||||
import uvicorn
|
||||
|
||||
prettyprinter.install_extras()
|
||||
|
||||
# Utility function to save screenshots
|
||||
def b64_to_png(b64_string: str, output_file):
|
||||
"""
|
||||
Convert a Base64-encoded string to a PNG file.
|
||||
|
||||
:param b64_string: A string containing Base64-encoded data
|
||||
:param output_file: The path to the output PNG file
|
||||
"""
|
||||
with open(output_file, "wb") as f:
|
||||
f.write(base64.b64decode(b64_string))
|
||||
|
||||
# Initialize FastAPI app
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
@app.post("/post_agent_history_step")
|
||||
async def post_agent_history_step(request: Request):
|
||||
data = await request.json()
|
||||
prettyprinter.cpprint(data)
|
||||
|
||||
# Ensure the "recordings" folder exists using pathlib
|
||||
recordings_folder = Path("recordings")
|
||||
recordings_folder.mkdir(exist_ok=True)
|
||||
|
||||
# Determine the next file number by examining existing .json files
|
||||
existing_numbers = []
|
||||
for item in recordings_folder.iterdir():
|
||||
if item.is_file() and item.suffix == ".json":
|
||||
try:
|
||||
file_num = int(item.stem)
|
||||
existing_numbers.append(file_num)
|
||||
except ValueError:
|
||||
# In case the file name isn't just a number
|
||||
pass
|
||||
|
||||
if existing_numbers:
|
||||
next_number = max(existing_numbers) + 1
|
||||
else:
|
||||
next_number = 1
|
||||
|
||||
# Construct the file path
|
||||
file_path = recordings_folder / f"{next_number}.json"
|
||||
|
||||
# Save the JSON data to the file
|
||||
with file_path.open("w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
# Optionally save screenshot if needed
|
||||
# if "website_screenshot" in data and data["website_screenshot"]:
|
||||
# screenshot_folder = Path("screenshots")
|
||||
# screenshot_folder.mkdir(exist_ok=True)
|
||||
# b64_to_png(data["website_screenshot"], screenshot_folder / f"{next_number}.png")
|
||||
|
||||
return {"status": "ok", "message": f"Saved to {file_path}"}
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Starting Browser-Use recording API on http://0.0.0.0:9000")
|
||||
uvicorn.run(app, host="0.0.0.0", port=9000)
|
||||
```
|
||||
|
||||
### Client Component (client.py)
|
||||
|
||||
The client component runs the Browser-Use agent with a recording hook:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#
|
||||
# Client to record and save Browser-Use activity.
|
||||
# Save this code to client.py and run with `python client.py`
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from pyobjtojson import obj_to_json
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
|
||||
# Load environment variables (for API keys)
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def send_agent_history_step(data):
|
||||
"""Send the agent step data to the recording API"""
|
||||
url = "http://127.0.0.1:9000/post_agent_history_step"
|
||||
response = requests.post(url, json=data)
|
||||
return response.json()
|
||||
|
||||
|
||||
async def record_activity(agent_obj):
|
||||
"""Hook function that captures and records agent activity at each step"""
|
||||
website_html = None
|
||||
website_screenshot = None
|
||||
urls_json_last_elem = None
|
||||
model_thoughts_last_elem = None
|
||||
model_outputs_json_last_elem = None
|
||||
model_actions_json_last_elem = None
|
||||
extracted_content_json_last_elem = None
|
||||
|
||||
print('--- ON_STEP_START HOOK ---')
|
||||
|
||||
# Capture current page state
|
||||
website_html = await agent_obj.browser_context.get_page_html()
|
||||
website_screenshot = await agent_obj.browser_context.take_screenshot()
|
||||
|
||||
# Make sure we have state history
|
||||
if hasattr(agent_obj, "state"):
|
||||
history = agent_obj.state.history
|
||||
else:
|
||||
history = None
|
||||
print("Warning: Agent has no state history")
|
||||
return
|
||||
|
||||
# Process model thoughts
|
||||
model_thoughts = obj_to_json(
|
||||
obj=history.model_thoughts(),
|
||||
check_circular=False
|
||||
)
|
||||
if len(model_thoughts) > 0:
|
||||
model_thoughts_last_elem = model_thoughts[-1]
|
||||
|
||||
# Process model outputs
|
||||
model_outputs = agent_obj.state.history.model_outputs()
|
||||
model_outputs_json = obj_to_json(
|
||||
obj=model_outputs,
|
||||
check_circular=False
|
||||
)
|
||||
if len(model_outputs_json) > 0:
|
||||
model_outputs_json_last_elem = model_outputs_json[-1]
|
||||
|
||||
# Process model actions
|
||||
model_actions = agent_obj.state.history.model_actions()
|
||||
model_actions_json = obj_to_json(
|
||||
obj=model_actions,
|
||||
check_circular=False
|
||||
)
|
||||
if len(model_actions_json) > 0:
|
||||
model_actions_json_last_elem = model_actions_json[-1]
|
||||
|
||||
# Process extracted content
|
||||
extracted_content = agent_obj.state.history.extracted_content()
|
||||
extracted_content_json = obj_to_json(
|
||||
obj=extracted_content,
|
||||
check_circular=False
|
||||
)
|
||||
if len(extracted_content_json) > 0:
|
||||
extracted_content_json_last_elem = extracted_content_json[-1]
|
||||
|
||||
# Process URLs
|
||||
urls = agent_obj.state.history.urls()
|
||||
urls_json = obj_to_json(
|
||||
obj=urls,
|
||||
check_circular=False
|
||||
)
|
||||
if len(urls_json) > 0:
|
||||
urls_json_last_elem = urls_json[-1]
|
||||
|
||||
# Create a summary of all data for this step
|
||||
model_step_summary = {
|
||||
"website_html": website_html,
|
||||
"website_screenshot": website_screenshot,
|
||||
"url": urls_json_last_elem,
|
||||
"model_thoughts": model_thoughts_last_elem,
|
||||
"model_outputs": model_outputs_json_last_elem,
|
||||
"model_actions": model_actions_json_last_elem,
|
||||
"extracted_content": extracted_content_json_last_elem
|
||||
}
|
||||
|
||||
print("--- MODEL STEP SUMMARY ---")
|
||||
print(f"URL: {urls_json_last_elem}")
|
||||
|
||||
# Send data to the API
|
||||
result = send_agent_history_step(data=model_step_summary)
|
||||
print(f"Recording API response: {result}")
|
||||
|
||||
|
||||
async def run_agent():
|
||||
"""Run the Browser-Use agent with the recording hook"""
|
||||
agent = Agent(
|
||||
task="Compare the price of gpt-4o and DeepSeek-V3",
|
||||
llm=ChatOpenAI(model="gpt-4o"),
|
||||
)
|
||||
|
||||
try:
|
||||
print("Starting Browser-Use agent with recording hook")
|
||||
await agent.run(
|
||||
on_step_start=record_activity,
|
||||
max_steps=30
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error running agent: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Check if API is running
|
||||
try:
|
||||
requests.get("http://127.0.0.1:9000")
|
||||
print("Recording API is available")
|
||||
except:
|
||||
print("Warning: Recording API may not be running. Start api.py first.")
|
||||
|
||||
# Run the agent
|
||||
asyncio.run(run_agent())
|
||||
```
|
||||
|
||||
### Working with the Recorded Data
|
||||
|
||||
After running the agent, you'll find the recorded data in the `recordings` directory. Here's how you can use this data:
|
||||
|
||||
1. **View recorded sessions**: Each JSON file contains a snapshot of agent activity for one step
|
||||
2. **Extract screenshots**: You can modify the API to save screenshots separately
|
||||
3. **Analyze agent behavior**: Use the recorded data to study how the agent navigates websites
|
||||
|
||||
### Extending the Example
|
||||
|
||||
You can extend this recording system in several ways:
|
||||
|
||||
1. **Save screenshots separately**: Uncomment the screenshot saving code in the API
|
||||
2. **Add a web dashboard**: Create a simple web interface to view recorded sessions
|
||||
3. **Add session IDs**: Modify the API to group steps by agent session
|
||||
4. **Add filtering**: Implement filters to record only specific types of actions
|
||||
|
||||
## Data Available in Hooks
|
||||
|
||||
When working with agent hooks, you have access to the entire agent instance. Here are some useful data points you can access:
|
||||
|
||||
- `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
|
||||
- `agent.state.history.model_outputs()`: Raw outputs from Browser Use's model.
|
||||
- `agent.state.history.model_actions()`: Actions taken by the agent
|
||||
- `agent.state.history.extracted_content()`: Content extracted from web pages
|
||||
- `agent.state.history.urls()`: URLs visited by the agent
|
||||
- `agent.browser_context.get_page_html()`: Current page HTML
|
||||
- `agent.browser_context.take_screenshot()`: Screenshot of the current page
|
||||
|
||||
## Tips for Using Hooks
|
||||
|
||||
- **Avoid blocking operations**: Since hooks run in the same execution thread as the agent, try to keep them efficient or use asynchronous patterns.
|
||||
- **Handle exceptions**: Make sure your hook functions handle exceptions gracefully to prevent interrupting the agent's main flow.
|
||||
- **Consider storage needs**: When capturing full HTML and screenshots, be mindful of storage requirements.
|
||||
|
||||
Contribution by Carlos A. Planchón.
|
||||
50
browser-use/docs/customize/output-format.mdx
Normal file
50
browser-use/docs/customize/output-format.mdx
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
---
|
||||
title: "Output Format"
|
||||
description: "The default is text. But you can define a structured output format to make post-processing easier."
|
||||
icon: "code"
|
||||
---
|
||||
|
||||
## Custom output format
|
||||
With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you.
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
# Define the output format as a Pydantic model
|
||||
class Post(BaseModel):
|
||||
post_title: str
|
||||
post_url: str
|
||||
num_comments: int
|
||||
hours_since_post: int
|
||||
|
||||
|
||||
class Posts(BaseModel):
|
||||
posts: List[Post]
|
||||
|
||||
|
||||
controller = Controller(output_model=Posts)
|
||||
|
||||
|
||||
async def main():
|
||||
task = 'Go to hackernews show hn and give me the first 5 posts'
|
||||
model = ChatOpenAI(model='gpt-4o')
|
||||
agent = Agent(task=task, llm=model, controller=controller)
|
||||
|
||||
history = await agent.run()
|
||||
|
||||
result = history.final_result()
|
||||
if result:
|
||||
parsed: Posts = Posts.model_validate_json(result)
|
||||
|
||||
for post in parsed.posts:
|
||||
print('\n--------------------------------')
|
||||
print(f'Title: {post.post_title}')
|
||||
print(f'URL: {post.post_url}')
|
||||
print(f'Comments: {post.num_comments}')
|
||||
print(f'Hours since post: {post.hours_since_post}')
|
||||
else:
|
||||
print('No result')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(main())
|
||||
```
|
||||
53
browser-use/docs/customize/real-browser.mdx
Normal file
53
browser-use/docs/customize/real-browser.mdx
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
---
|
||||
title: "Connect to your Browser"
|
||||
description: "With this you can connect to your real browser, where you are logged in with all your accounts."
|
||||
icon: "computer"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
You can connect the agent to your real Chrome browser instance, allowing it to access your existing browser profile with all your logged-in accounts and settings. This is particularly useful when you want the agent to interact with services where you're already authenticated.
|
||||
|
||||
<Note>
|
||||
First make sure to close all running Chrome instances.
|
||||
</Note>
|
||||
|
||||
## Basic Configuration
|
||||
|
||||
To connect to your real Chrome browser, you'll need to specify the path to your Chrome executable when creating the Browser instance:
|
||||
|
||||
```python
|
||||
from browser_use import Agent, Browser, BrowserConfig
|
||||
from langchain_openai import ChatOpenAI
|
||||
import asyncio
|
||||
# Configure the browser to connect to your Chrome instance
|
||||
browser = Browser(
|
||||
config=BrowserConfig(
|
||||
# Specify the path to your Chrome executable
|
||||
browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # macOS path
|
||||
# For Windows, typically: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
|
||||
# For Linux, typically: '/usr/bin/google-chrome'
|
||||
)
|
||||
)
|
||||
|
||||
# Create the agent with your configured browser
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=ChatOpenAI(model='gpt-4o'),
|
||||
browser=browser,
|
||||
)
|
||||
|
||||
async def main():
|
||||
await agent.run()
|
||||
|
||||
input('Press Enter to close the browser...')
|
||||
await browser.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
|
||||
<Note>
|
||||
When using your real browser, the agent will have access to all your logged-in sessions. Make sure to ALWAYS review the task you're giving to the agent and ensure it aligns with your security requirements!
|
||||
</Note>
|
||||
76
browser-use/docs/customize/sensitive-data.mdx
Normal file
76
browser-use/docs/customize/sensitive-data.mdx
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
---
|
||||
title: "Sensitive Data"
|
||||
description: "Handle sensitive information securely by preventing the model from seeing actual passwords."
|
||||
icon: "shield"
|
||||
---
|
||||
|
||||
## Handling Sensitive Data
|
||||
|
||||
When working with sensitive information like passwords, you can use the `sensitive_data` parameter to prevent the model from seeing the actual values while still allowing it to reference them in its actions.
|
||||
|
||||
Make sure to always set [`allowed_domains`](https://docs.browser-use.com/customize/browser-settings#restrict-urls) to restrict the domains the Agent is allowed to visit when working with sensitive data or logins.
|
||||
|
||||
Here's an example of how to use sensitive data:
|
||||
|
||||
```python
|
||||
from dotenv import load_dotenv
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent, Browser, BrowserConfig
|
||||
from browser_use.browser.context import BrowserContextConfig
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatOpenAI(
|
||||
model='gpt-4o',
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
# Define sensitive data
|
||||
# The model will only see the keys (x_name, x_password) but never the actual values
|
||||
sensitive_data = {'x_name': 'magnus', 'x_password': '12345678'}
|
||||
|
||||
# Use the placeholder names in your task description
|
||||
task = 'go to x.com and login with x_name and x_password then write a post about the meaning of life'
|
||||
|
||||
# Configure allowed_domains that the agent should be restricted to in BrowserContextConfig
|
||||
context_config = BrowserContextConfig(
|
||||
allowed_domains=['example.com'],
|
||||
)
|
||||
|
||||
# Pass the sensitive data to the agent
|
||||
agent = Agent(
|
||||
task=task,
|
||||
llm=llm,
|
||||
sensitive_data=sensitive_data,
|
||||
browser=Browser(
|
||||
config=BrowserConfig(
|
||||
new_context_config=context_config
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
async def main():
|
||||
await agent.run()
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
In this example:
|
||||
1. The model only sees `x_name` and `x_password` as placeholders.
|
||||
2. When the model wants to use your password it outputs x_password - and we replace it with the actual value.
|
||||
3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state.
|
||||
4. The agent will be prevented from going to any site not on `example.com` to protect from prompt injection attacks and jailbreaks
|
||||
|
||||
### Missing or Empty Values
|
||||
|
||||
When working with sensitive data, keep these details in mind:
|
||||
|
||||
- If a key referenced by the model (`<secret>key_name</secret>`) is missing from your `sensitive_data` dictionary, a warning will be logged but the substitution tag will be preserved.
|
||||
- If you provide an empty value for a key in the `sensitive_data` dictionary, it will be treated the same as a missing key.
|
||||
- The system will always attempt to process all valid substitutions, even if some keys are missing or empty.
|
||||
|
||||
Warning: Vision models still see the image of the page - where the sensitive data might be visible.
|
||||
|
||||
This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication.
|
||||
293
browser-use/docs/customize/supported-models.mdx
Normal file
293
browser-use/docs/customize/supported-models.mdx
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
---
|
||||
title: "Supported Models"
|
||||
description: "Guide to using different LangChain chat models with Browser Use"
|
||||
icon: "robot"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Browser Use supports various LangChain chat models. Here's how to configure and use the most popular ones. The full list is available in the [LangChain documentation](https://python.langchain.com/docs/integrations/chat/).
|
||||
|
||||
## Model Recommendations
|
||||
|
||||
We have yet to test performance across all models. Currently, we achieve the best results using GPT-4o with an 89% accuracy on the [WebVoyager Dataset](https://browser-use.com/posts/sota-technical-report). DeepSeek-V3 is 30 times cheaper than GPT-4o. Gemini-2.0-exp is also gaining popularity in the community because it is currently free.
|
||||
We also support local models, like Qwen 2.5, but be aware that small models often return the wrong output structure, which leads to parsing errors. We believe that local models will improve significantly this year.
|
||||
|
||||
|
||||
<Note>
|
||||
All models require their respective API keys. Make sure to set them in your
|
||||
environment variables before running the agent.
|
||||
</Note>
|
||||
|
||||
## Supported Models
|
||||
|
||||
All LangChain chat models that support tool-calling are available. We will document the most popular ones here.
|
||||
|
||||
### OpenAI
|
||||
|
||||
OpenAI's GPT-4o models are recommended for best performance.
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatOpenAI(
|
||||
model="gpt-4o",
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
OPENAI_API_KEY=
|
||||
```
|
||||
|
||||
### Anthropic
|
||||
|
||||
|
||||
```python
|
||||
from langchain_anthropic import ChatAnthropic
|
||||
from browser_use import Agent
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatAnthropic(
|
||||
model_name="claude-3-5-sonnet-20240620",
|
||||
temperature=0.0,
|
||||
timeout=100, # Increase for complex tasks
|
||||
)
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm
|
||||
)
|
||||
```
|
||||
|
||||
And add the variable:
|
||||
|
||||
```bash .env
|
||||
ANTHROPIC_API_KEY=
|
||||
```
|
||||
|
||||
### Azure OpenAI
|
||||
|
||||
```python
|
||||
from langchain_openai import AzureChatOpenAI
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
import os
|
||||
|
||||
# Initialize the model
|
||||
llm = AzureChatOpenAI(
|
||||
model="gpt-4o",
|
||||
api_version='2024-10-21',
|
||||
azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
|
||||
api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
|
||||
)
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
|
||||
AZURE_OPENAI_KEY=
|
||||
```
|
||||
|
||||
|
||||
### Gemini
|
||||
|
||||
> [!IMPORTANT]
|
||||
> `GEMINI_API_KEY` was the old environment variable name; it should be called `GOOGLE_API_KEY` as of 2025-05.
|
||||
|
||||
```python
|
||||
from langchain_google_genai import ChatGoogleGenerativeAI
|
||||
from browser_use import Agent
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Read GOOGLE_API_KEY into env
|
||||
load_dotenv()
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp')
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
GOOGLE_API_KEY=
|
||||
```
|
||||
|
||||
|
||||
### DeepSeek-V3
|
||||
The community likes DeepSeek-V3 for its low price, no rate limits, open-source nature, and good performance.
|
||||
The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek.py).
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv("DEEPSEEK_API_KEY")
|
||||
|
||||
# Initialize the model
|
||||
llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-chat', api_key=SecretStr(api_key))
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm,
|
||||
use_vision=False
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
DEEPSEEK_API_KEY=
|
||||
```
|
||||
|
||||
### DeepSeek-R1
|
||||
We support DeepSeek-R1. It's not fully tested yet, and more functionality will be added over time, e.g. the output of its reasoning content.
|
||||
The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek-r1.py).
|
||||
It does not support vision. The model is open-source so you could also use it with Ollama, but we have not tested it.
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv("DEEPSEEK_API_KEY")
|
||||
|
||||
# Initialize the model
|
||||
llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-reasoner', api_key=SecretStr(api_key))
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm,
|
||||
use_vision=False
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
DEEPSEEK_API_KEY=
|
||||
```
|
||||
|
||||
### Ollama
|
||||
Many users asked for local models. Here they are.
|
||||
|
||||
1. Download Ollama from [here](https://ollama.ai/download)
|
||||
2. Run `ollama pull model_name`. Pick a model which supports tool-calling from [here](https://ollama.com/search?c=tools)
|
||||
3. Run `ollama start`
|
||||
|
||||
```python
|
||||
from langchain_ollama import ChatOllama
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
|
||||
|
||||
# Initialize the model
|
||||
llm=ChatOllama(model="qwen2.5", num_ctx=32000)
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables: None!
|
||||
|
||||
### Novita AI
|
||||
[Novita AI](https://novita.ai) is an LLM API provider that offers a wide range of models. Note: choose a model that supports function calling.
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv("NOVITA_API_KEY")
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatOpenAI(base_url='https://api.novita.ai/v3/openai', model='deepseek/deepseek-v3-0324', api_key=SecretStr(api_key))
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm,
|
||||
use_vision=False
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
NOVITA_API_KEY=
|
||||
```
|
||||
### X AI
|
||||
[X AI](https://x.ai) is an LLM API provider that offers a wide range of models. Note: choose a model that supports function calling.
|
||||
|
||||
```python
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
from pydantic import SecretStr
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
api_key = os.getenv("GROK_API_KEY")
|
||||
|
||||
# Initialize the model
|
||||
llm = ChatOpenAI(
|
||||
base_url='https://api.x.ai/v1',
|
||||
model='grok-3-beta',
|
||||
api_key=SecretStr(api_key)
|
||||
)
|
||||
|
||||
# Create agent with the model
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm,
|
||||
use_vision=False
|
||||
)
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
|
||||
```bash .env
|
||||
GROK_API_KEY=
|
||||
```
|
||||
|
||||
## Coming soon
|
||||
(We are working on it)
|
||||
- Groq
|
||||
- GitHub
|
||||
- Fine-tuned models
|
||||
77
browser-use/docs/customize/system-prompt.mdx
Normal file
77
browser-use/docs/customize/system-prompt.mdx
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
---
|
||||
title: "System Prompt"
|
||||
description: "Customize the system prompt to control agent behavior and capabilities"
|
||||
icon: "message"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
You can customize the system prompt in two ways:
|
||||
|
||||
1. Extend the default system prompt with additional instructions
|
||||
2. Override the default system prompt entirely
|
||||
|
||||
<Note>
|
||||
Custom system prompts allow you to modify the agent's behavior at a
|
||||
fundamental level. Use this feature carefully as it can significantly impact
|
||||
the agent's performance and reliability.
|
||||
</Note>
|
||||
|
||||
### Extend System Prompt (recommended)
|
||||
|
||||
To add additional instructions to the default system prompt:
|
||||
|
||||
```python
|
||||
extend_system_message = """
|
||||
REMEMBER the most important RULE:
|
||||
ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!
|
||||
"""
|
||||
```
|
||||
|
||||
### Override System Prompt
|
||||
|
||||
<Warning>
|
||||
Not recommended! If you must override the [default system
|
||||
prompt](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/system_prompt.md),
|
||||
make sure to test the agent yourself.
|
||||
</Warning>
|
||||
|
||||
Anyway, to override the default system prompt:
|
||||
|
||||
```python
|
||||
# Define your complete custom prompt
|
||||
override_system_message = """
|
||||
You are an AI agent that helps users with web browsing tasks.
|
||||
|
||||
[Your complete custom instructions here...]
|
||||
"""
|
||||
|
||||
# Create agent with custom system prompt
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=ChatOpenAI(model='gpt-4'),
|
||||
override_system_message=override_system_message
|
||||
)
|
||||
```
|
||||
|
||||
### Extend Planner System Prompt
|
||||
|
||||
You can customize the behavior of the planning agent by extending its system prompt:
|
||||
|
||||
```python
|
||||
extend_planner_system_message = """
|
||||
PRIORITIZE gathering information before taking any action.
|
||||
Always suggest exploring multiple options before making a decision.
|
||||
"""
|
||||
|
||||
# Create agent with extended planner system prompt
|
||||
llm = ChatOpenAI(model='gpt-4o')
|
||||
planner_llm = ChatOpenAI(model='gpt-4o-mini')
|
||||
|
||||
agent = Agent(
|
||||
task="Your task here",
|
||||
llm=llm,
|
||||
planner_llm=planner_llm,
|
||||
extend_planner_system_message=extend_planner_system_message
|
||||
)
|
||||
```
|
||||
128
browser-use/docs/development.mdx
Normal file
128
browser-use/docs/development.mdx
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
---
|
||||
title: 'Development'
|
||||
description: 'Preview changes locally to update your docs'
|
||||
---
|
||||
|
||||
<Info>
|
||||
**Prerequisite**: Please install Node.js (version 19 or higher) before proceeding.
|
||||
</Info>
|
||||
|
||||
Follow these steps to install and run Mintlify on your operating system:
|
||||
|
||||
**Step 1**: Install Mintlify:
|
||||
|
||||
<CodeGroup>
|
||||
|
||||
```bash npm
|
||||
npm i -g mintlify
|
||||
```
|
||||
|
||||
```bash yarn
|
||||
yarn global add mintlify
|
||||
```
|
||||
|
||||
</CodeGroup>
|
||||
|
||||
**Step 2**: Navigate to the docs directory (where the `mint.json` file is located) and execute the following command:
|
||||
|
||||
```bash
|
||||
mintlify dev
|
||||
```
|
||||
|
||||
A local preview of your documentation will be available at `http://localhost:3000`.
|
||||
|
||||
### Custom Ports
|
||||
|
||||
By default, Mintlify uses port 3000. You can customize the port Mintlify runs on by using the `--port` flag. To run Mintlify on port 3333, for instance, use this command:
|
||||
|
||||
```bash
|
||||
mintlify dev --port 3333
|
||||
```
|
||||
|
||||
If you attempt to run Mintlify on a port that's already in use, it will use the next available port:
|
||||
|
||||
```md
|
||||
Port 3000 is already in use. Trying 3001 instead.
|
||||
```
|
||||
|
||||
## Mintlify Versions
|
||||
|
||||
Please note that each CLI release is associated with a specific version of Mintlify. If your local website doesn't align with the production version, please update the CLI:
|
||||
|
||||
<CodeGroup>
|
||||
|
||||
```bash npm
|
||||
npm i -g mintlify@latest
|
||||
```
|
||||
|
||||
```bash yarn
|
||||
yarn global upgrade mintlify
|
||||
```
|
||||
|
||||
</CodeGroup>
|
||||
|
||||
## Validating Links
|
||||
|
||||
The CLI can assist with validating reference links made in your documentation. To identify any broken links, use the following command:
|
||||
|
||||
```bash
|
||||
mintlify broken-links
|
||||
```
|
||||
|
||||
## Deployment
|
||||
|
||||
<Tip>
|
||||
Unlimited editors available under the [Pro
|
||||
Plan](https://mintlify.com/pricing) and above.
|
||||
</Tip>
|
||||
|
||||
If the deployment is successful, you should see the following:
|
||||
|
||||
<Frame>
|
||||
<img src="/images/checks-passed.png" style={{ borderRadius: '0.5rem' }} />
|
||||
</Frame>
|
||||
|
||||
## Code Formatting
|
||||
|
||||
We suggest using extensions on your IDE to recognize and format MDX. If you're a VSCode user, consider the [MDX VSCode extension](https://marketplace.visualstudio.com/items?itemName=unifiedjs.vscode-mdx) for syntax highlighting, and [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) for code formatting.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
<AccordionGroup>
|
||||
<Accordion title='Error: Could not load the "sharp" module using the darwin-arm64 runtime'>
|
||||
|
||||
This may be due to an outdated version of node. Try the following:
|
||||
1. Remove the currently-installed version of mintlify: `npm remove -g mintlify`
|
||||
2. Upgrade to Node v19 or higher.
|
||||
3. Reinstall mintlify: `npm install -g mintlify`
|
||||
</Accordion>
|
||||
|
||||
<Accordion title="Issue: Encountering an unknown error">
|
||||
|
||||
Solution: Go to the root of your device and delete the \~/.mintlify folder. Afterwards, run `mintlify dev` again.
|
||||
</Accordion>
|
||||
</AccordionGroup>
|
||||
|
||||
Curious about what changed in the CLI version? [Check out the CLI changelog.](https://www.npmjs.com/package/mintlify?activeTab=versions)
|
||||
|
||||
# Development Workflow
|
||||
|
||||
## Branches
|
||||
- **`stable`**: Mirrors the latest stable release. This branch is updated only when a new stable release is published (every few weeks).
|
||||
- **`main`**: The primary development branch. This branch is updated frequently (every hour or more).
|
||||
|
||||
## Tags
|
||||
- **`x.x.x`**: Stable release tags. These are created for stable releases and updated every few weeks.
|
||||
- **`x.x.xrcXX`**: Pre-release tags. These are created for unstable pre-releases and updated every Friday at 5 PM UTC.
|
||||
|
||||
## Workflow Summary
|
||||
1. **Push to `main`**:
|
||||
- Runs pre-commit hooks to fix formatting.
|
||||
- Executes tests to ensure code quality.
|
||||
|
||||
2. **Release a new version**:
|
||||
- If the tag is a pre-release (`x.x.xrcXX`), the package is pushed to PyPI as a pre-release.
|
||||
- If the tag is a stable release (`x.x.x`), the package is pushed to PyPI as a stable release, and the `stable` branch is updated to match the release.
|
||||
|
||||
3. **Scheduled Pre-Releases**:
|
||||
- Every Friday at 5 PM UTC, a new pre-release tag (`x.x.xrcXX`) is created from the `main` branch and pushed to the repository.
|
||||
12
browser-use/docs/development/contribution-guide.mdx
Normal file
12
browser-use/docs/development/contribution-guide.mdx
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
title: "Contribution Guide"
|
||||
description: "Learn how to contribute to Browser Use"
|
||||
icon: "github"
|
||||
---
|
||||
|
||||
|
||||
- check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on
|
||||
- get inspiration / share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel and on [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)!
|
||||
- no typo/style-only nit PRs, you can submit nit fixes but only if part of larger bugfix or new feature PRs
|
||||
- include a demo screenshot/gif, tests, and ideally an example script demonstrating any changes in your PR
|
||||
- bump your issues/PRs with comments periodically if you want them to be merged faster
|
||||
48
browser-use/docs/development/evaluations.mdx
Normal file
48
browser-use/docs/development/evaluations.mdx
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
---
|
||||
title: "Evaluations"
|
||||
description: "Test the Browser Use agent on standardized benchmarks"
|
||||
icon: "chart-bar"
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Browser Use uses proprietary/private test sets that must never be committed to GitHub and must be fetched through an authorized API request.
|
||||
Accessing these test sets requires an approved Browser Use account.
|
||||
There are currently no publicly available test sets, but some may be released in the future.
|
||||
|
||||
## Get an API Access Key
|
||||
|
||||
First, navigate to https://browser-use.tools and log in with an authorized Browser Use account.
|
||||
|
||||
Then, click the "Account" button at the top right of the page, and click the "Cycle New Key" button on that page.
|
||||
|
||||
Copy the resulting url and secret key into your `.env` file. It should look like this:
|
||||
|
||||
```bash .env
|
||||
EVALUATION_TOOL_URL= ...
|
||||
EVALUATION_TOOL_SECRET_KEY= ...
|
||||
```
|
||||
|
||||
## Running Evaluations
|
||||
|
||||
First, ensure your file `eval/service.py` is up to date.
|
||||
|
||||
Then run the file:
|
||||
|
||||
```bash
|
||||
python eval/service.py
|
||||
```
|
||||
|
||||
## Configuring Evaluations
|
||||
|
||||
You can modify the evaluation by providing flags to the evaluation script. For instance:
|
||||
|
||||
```bash
|
||||
python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100 --model gpt-4o
|
||||
```
|
||||
|
||||
The evaluations webpage has a convenient GUI for generating these commands. To use it, navigate to https://browser-use.tools/dashboard.
|
||||
|
||||
Then click the button "New Eval Run" on the left panel. This will open an interface with selectors, inputs, sliders, and switches.
|
||||
|
||||
Input your desired configuration into the interface and copy the resulting python command at the bottom. Then run this command as before.
|
||||
119
browser-use/docs/development/local-setup.mdx
Normal file
119
browser-use/docs/development/local-setup.mdx
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
---
|
||||
title: "Local Setup"
|
||||
description: "Set up Browser Use development environment locally"
|
||||
icon: "laptop-code"
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Browser Use requires Python 3.11 or higher. We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management.
|
||||
|
||||
## Clone the Repository
|
||||
|
||||
First, clone the Browser Use repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/browser-use/browser-use
|
||||
cd browser-use
|
||||
```
|
||||
|
||||
## Environment Setup
|
||||
|
||||
1. Create and activate a virtual environment:
|
||||
|
||||
```bash
|
||||
uv venv --python 3.11
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
# Install the package in editable mode with all development dependencies
|
||||
uv sync --all-extras
|
||||
|
||||
# Install the default browser
|
||||
playwright install chromium --with-deps --no-shell
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Set up your environment variables:
|
||||
|
||||
```bash
|
||||
# Copy the example environment file
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Or manually create a `.env` file and set the API keys for the models you want to use:
|
||||
|
||||
```bash .env
|
||||
OPENAI_API_KEY=...
|
||||
ANTHROPIC_API_KEY=
|
||||
AZURE_OPENAI_ENDPOINT=
|
||||
AZURE_OPENAI_KEY=
|
||||
GOOGLE_API_KEY=
|
||||
DEEPSEEK_API_KEY=
|
||||
GROK_API_KEY=
|
||||
NOVITA_API_KEY=
|
||||
```
|
||||
|
||||
<Note>
|
||||
You can use any LLM model supported by LangChain. See
|
||||
[LangChain Models](/customize/supported-models) for available options and their specific
|
||||
API key requirements.
|
||||
</Note>
|
||||
|
||||
## Development
|
||||
|
||||
After setup, you can:
|
||||
|
||||
- Try demos in the example library with `uv run examples/simple.py`
|
||||
- Run the linter/formatter with `uv run ruff format examples/some/file.py`
|
||||
- Run tests with `uv run pytest`
|
||||
- Build the package with `uv build`
|
||||
|
||||
### Linting
|
||||
|
||||
```bash
|
||||
# Run the linter on the whole project (must pass for PR to be allowed to merge)
|
||||
uv run pre-commit run --all-files
|
||||
|
||||
# Install the linter & formatter pre-commit hooks to run automatically
|
||||
pre-commit install --install-hooks
|
||||
|
||||
# Experimental: run the type checker
|
||||
uv run type
|
||||
```
|
||||
|
||||
### Tests
|
||||
|
||||
```bash
|
||||
# Run tests
|
||||
uv run pytest # run everything
|
||||
uv run pytest tests/test_controller.py # run a specific test file
|
||||
uv run pytest tests/test_sensitive_data.py tests/test_tab_management.py # run two test files
|
||||
uv run pytest tests/test_tab_management.py::TestTabManagement::test_user_changes_tab # run a single test
|
||||
```
|
||||
|
||||
### Build
|
||||
|
||||
```bash
|
||||
uv build
|
||||
uv pip install dist/*.whl
|
||||
|
||||
# push build to PyPI (automatically run by GitHub Actions CI)
|
||||
uv publish
|
||||
```
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you run into any issues:
|
||||
|
||||
1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
|
||||
2. Join our [Discord community](https://link.browser-use.com/discord) for support
|
||||
|
||||
<Note>
|
||||
We welcome contributions! See our [Contribution Guide](/development/contribution-guide) for guidelines on how to help improve
|
||||
Browser Use.
|
||||
</Note>
|
||||
122
browser-use/docs/development/n8n-integration.mdx
Normal file
122
browser-use/docs/development/n8n-integration.mdx
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
---
|
||||
title: 'n8n Integration'
|
||||
description: 'Learn how to integrate Browser Use with n8n workflows'
|
||||
---
|
||||
|
||||
# Browser Use n8n Integration
|
||||
|
||||
Browser Use can be integrated with [n8n](https://n8n.io), a workflow automation platform, using our community node. This integration allows you to trigger browser automation tasks directly from your n8n workflows.
|
||||
|
||||
## Installing the n8n Community Node
|
||||
|
||||
There are several ways to install the Browser Use community node in n8n:
|
||||
|
||||
### Using n8n Desktop or Cloud
|
||||
|
||||
1. Navigate to **Settings > Community Nodes**
|
||||
2. Click on **Install**
|
||||
3. Enter `n8n-nodes-browser-use` in the **Name** field
|
||||
4. Click **Install**
|
||||
|
||||
### Using a Self-hosted n8n Instance
|
||||
|
||||
Run the following command in your n8n installation directory:
|
||||
|
||||
```bash
|
||||
npm install n8n-nodes-browser-use
|
||||
```
|
||||
|
||||
### For Development
|
||||
|
||||
If you want to develop with the n8n node:
|
||||
|
||||
1. Clone the repository:
|
||||
```bash
|
||||
git clone https://github.com/draphonix/n8n-nodes-browser-use.git
|
||||
```
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
cd n8n-nodes-browser-use
|
||||
npm install
|
||||
```
|
||||
3. Build the code:
|
||||
```bash
|
||||
npm run build
|
||||
```
|
||||
4. Link to your n8n installation:
|
||||
```bash
|
||||
npm link
|
||||
```
|
||||
5. In your n8n installation directory:
|
||||
```bash
|
||||
npm link n8n-nodes-browser-use
|
||||
```
|
||||
|
||||
## Setting Up Browser Use Cloud API Credentials
|
||||
|
||||
To use the Browser Use node in n8n, you need to configure API credentials:
|
||||
|
||||
1. Sign up for an account at [Browser Use Cloud](https://cloud.browser-use.com)
|
||||
2. Navigate to the Settings or API section
|
||||
3. Generate or copy your API key
|
||||
4. In n8n, create a new credential:
|
||||
- Go to **Credentials** tab
|
||||
- Click **Create New**
|
||||
- Select **Browser Use Cloud API**
|
||||
- Enter your API key
|
||||
- Save the credential
|
||||
|
||||
## Using the Browser Use Node
|
||||
|
||||
Once installed, you can add the Browser Use node to your workflows:
|
||||
|
||||
1. In your workflow editor, search for "Browser Use" in the nodes panel
|
||||
2. Add the node to your workflow
|
||||
3. Set up the credentials
|
||||
4. Choose your saved credentials
|
||||
5. Select an operation:
|
||||
- **Run Task**: Execute a browser automation task with natural language instructions
|
||||
- **Get Task**: Retrieve task details
|
||||
- **Get Task Status**: Check task execution status
|
||||
- **Pause/Resume/Stop Task**: Control running tasks
|
||||
- **Get Task Media**: Retrieve screenshots, videos, or PDFs
|
||||
- **List Tasks**: Get a list of tasks
|
||||
|
||||
### Example: Running a Browser Task
|
||||
|
||||
Here's a simple example of how to use the Browser Use node to run a browser task:
|
||||
|
||||
1. Add the Browser Use node to your workflow
|
||||
2. Select the "Run Task" operation
|
||||
3. In the "Instructions" field, enter a natural language description of what you want the browser to do, for example:
|
||||
```
|
||||
Go to example.com, take a screenshot of the homepage, and extract all the main heading texts
|
||||
```
|
||||
4. Optionally enable "Save Browser Data" to preserve cookies and session information
|
||||
5. Connect the node to subsequent nodes to process the results
|
||||
|
||||
## Workflow Examples
|
||||
|
||||
The Browser Use n8n node enables various automation scenarios:
|
||||
|
||||
- **Web Scraping**: Extract data from websites on a schedule
|
||||
- **Form Filling**: Automate data entry across web applications
|
||||
- **Monitoring**: Check website status and capture visual evidence
|
||||
- **Report Generation**: Generate PDFs or screenshots of web dashboards
|
||||
- **Multi-step Processes**: Chain browser tasks together using session persistence
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you encounter issues with the Browser Use node:
|
||||
|
||||
- Verify your API key is valid and has sufficient credits
|
||||
- Check that your instructions are clear and specific
|
||||
- For complex tasks, consider breaking them into multiple steps
|
||||
- Refer to the [Browser Use documentation](https://docs.browser-use.com) for instruction best practices
|
||||
|
||||
## Resources
|
||||
|
||||
- [n8n Community Nodes Documentation](https://docs.n8n.io/integrations/community-nodes/)
|
||||
- [Browser Use Documentation](https://docs.browser-use.com)
|
||||
- [Browser Use Cloud](https://cloud.browser-use.com)
|
||||
- [n8n-nodes-browser-use GitHub Repository](https://github.com/draphonix/n8n-nodes-browser-use)
|
||||
66
browser-use/docs/development/observability.mdx
Normal file
66
browser-use/docs/development/observability.mdx
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
---
|
||||
title: "Observability"
|
||||
description: "Trace Browser Use's agent execution steps and browser sessions"
|
||||
icon: "eye"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Browser Use has a native integration with [Laminar](https://lmnr.ai), an open-source platform for tracing, evals, and labeling of AI agents.
|
||||
Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai).
|
||||
|
||||
<Note>
|
||||
Laminar excels at tracing browser agents by providing unified visibility into both browser session recordings and agent execution steps.
|
||||
</Note>
|
||||
|
||||
## Setup
|
||||
|
||||
To set up Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable.
|
||||
|
||||
To get your project API key, you can either:
|
||||
- Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings
|
||||
- Or spin up a local Laminar instance and get the key from the settings page
|
||||
|
||||
```bash
|
||||
pip install 'lmnr[all]'
|
||||
export LMNR_PROJECT_API_KEY=<your-project-api-key>
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Then, simply initialize Laminar at the top of your project, and both Browser Use and session recordings will be automatically traced.
|
||||
|
||||
```python {5-8}
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
import asyncio
|
||||
|
||||
from lmnr import Laminar
|
||||
# this line auto-instruments Browser Use and any browser you use (local or remote)
|
||||
Laminar.initialize(project_api_key="...") # you can also pass project api key here
|
||||
|
||||
async def main():
|
||||
agent = Agent(
|
||||
task="open google, search Laminar AI",
|
||||
llm=ChatOpenAI(model="gpt-4o-mini"),
|
||||
)
|
||||
result = await agent.run()
|
||||
print(result)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Viewing Traces
|
||||
|
||||
You can view traces in the Laminar UI by going to the traces tab in your project.
|
||||
When you select a trace, you can see both the browser session recording and the agent execution steps.
|
||||
|
||||
The timeline of the browser session is synced with the agent execution steps; timeline highlights indicate the agent's current step synced with the browser session.
|
||||
In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output. Tools are highlighted in the timeline with a yellow color.
|
||||
|
||||
<img className="block" src="/images/laminar.png" alt="Laminar" />
|
||||
|
||||
|
||||
## Laminar
|
||||
|
||||
To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai).
|
||||
7
browser-use/docs/development/roadmap.mdx
Normal file
7
browser-use/docs/development/roadmap.mdx
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
title: "Roadmap"
|
||||
description: "Future plans and upcoming features for Browser Use"
|
||||
icon: "road"
|
||||
---
|
||||
|
||||
Big things coming soon!
|
||||
39
browser-use/docs/development/telemetry.mdx
Normal file
39
browser-use/docs/development/telemetry.mdx
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
---
|
||||
title: "Telemetry"
|
||||
description: "Understanding Browser Use's telemetry and privacy settings"
|
||||
icon: "chart-mixed"
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development.
|
||||
|
||||
## Data Collection
|
||||
|
||||
We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information.
|
||||
|
||||
<Note>
|
||||
We never collect personal information, credentials, or specific content from
|
||||
your browser automation tasks.
|
||||
</Note>
|
||||
|
||||
## Opting Out
|
||||
|
||||
You can disable telemetry by setting an environment variable:
|
||||
|
||||
```bash .env
|
||||
ANONYMIZED_TELEMETRY=false
|
||||
```
|
||||
|
||||
Or in your Python code:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["ANONYMIZED_TELEMETRY"] = "false"
|
||||
```
|
||||
|
||||
<Note>
|
||||
Even when enabled, telemetry has zero impact on the library's performance or
|
||||
functionality. Code is available in [Telemetry
|
||||
Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry).
|
||||
</Note>
|
||||
13
browser-use/docs/favicon.svg
Normal file
13
browser-use/docs/favicon.svg
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
<svg width="100" height="100" viewBox="0 0 100 100" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<g clip-path="url(#clip0_7_13)">
|
||||
<path d="M97.8916 39.0448C82.6177 33.1997 95.2199 10.8169 74.212 11.3849C48.5413 12.0793 8.31528 52.4518 12.4236 78.6851C14.4652 91.6755 24.6096 86.2218 29.3732 88.1154C32.5364 89.3652 36.2792 95.0083 40.3245 95.9047C22.4293 106.193 -0.556809 96.397 0.0102912 74.3423C0.829435 41.86 47.7474 -5.25386 81.1937 0.477571C99.8702 3.68414 102.189 23.5422 97.8916 39.0448Z" fill="white"/>
|
||||
<path d="M24.8115 57.7541L39.6068 71.7166C49.0332 80.1875 74.061 94.9706 85.403 84.9469C98.774 73.1306 70.495 32.3162 57.4769 25.802L68.9069 20.6639C86.7138 33.6796 113.783 75.9836 91.7294 94.4025C77.5014 106.282 54.5655 96.2204 41.0811 87.3707C30.8103 80.6294 15.9647 70.9591 24.8115 57.7415V57.7541Z" fill="white"/>
|
||||
<path d="M40.3373 4.75723C35.5485 4.88347 31.8055 11.1199 28.2895 12.2182C25.1642 13.1903 20.8414 10.5266 16.1408 14.0487C11.0495 17.8613 12.7891 36.0655 3.02233 40.5976C-2.98893 22.9362 0.75354 1.8789 22.4672 0.0736228C24.1433 -0.0652445 42.7822 1.17195 40.3373 4.74463V4.75723Z" fill="white"/>
|
||||
<path d="M76.1025 57.754C84.1175 71.0348 69.5871 86.2092 57.489 74.1025L76.1025 57.754Z" fill="white"/>
|
||||
</g>
|
||||
<defs>
|
||||
<clipPath id="clip0_7_13">
|
||||
<rect width="100" height="100" fill="white"/>
|
||||
</clipPath>
|
||||
</defs>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.3 KiB |
BIN
browser-use/docs/images/browser-use.png
Normal file
BIN
browser-use/docs/images/browser-use.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
BIN
browser-use/docs/images/checks-passed.png
Normal file
BIN
browser-use/docs/images/checks-passed.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 157 KiB |
BIN
browser-use/docs/images/laminar.png
Normal file
BIN
browser-use/docs/images/laminar.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 956 KiB |
101
browser-use/docs/introduction.mdx
Normal file
101
browser-use/docs/introduction.mdx
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
---
|
||||
title: "Introduction"
|
||||
description: "Welcome to Browser Use - We enable AI to control your browser"
|
||||
icon: "book-open"
|
||||
---
|
||||
|
||||
<img className="block" src="/images/browser-use.png" alt="Browser Use" />
|
||||
|
||||
## Overview
|
||||
|
||||
Browser Use is the easiest way to connect your AI agents with the browser. It makes websites accessible for AI agents by providing a powerful, yet simple interface for browser automation.
|
||||
|
||||
<Note>
|
||||
If you have used Browser Use for your project, feel free to show it off in our
|
||||
[Discord community](https://link.browser-use.com/discord)!
|
||||
</Note>
|
||||
|
||||
## Getting Started
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card title="Quick Start" icon="rocket" href="/quickstart">
|
||||
Get up and running with Browser Use in minutes
|
||||
</Card>
|
||||
<Card
|
||||
title="Supported Models"
|
||||
icon="robot"
|
||||
href="/customize/supported-models"
|
||||
>
|
||||
Configure different LLMs for your agents
|
||||
</Card>
|
||||
<Card title="Agent Settings" icon="gear" href="/customize/agent-settings">
|
||||
Learn how to configure and customize your agents
|
||||
</Card>
|
||||
<Card title="Custom Functions" icon="code" href="/customize/custom-functions">
|
||||
Extend functionality with custom actions
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
## Fancy Demos
|
||||
|
||||
### Writing in Google Docs
|
||||
|
||||
Task: Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF.
|
||||
|
||||
<Frame>
|
||||
<img src="https://github.com/user-attachments/assets/242ade3e-15bc-41c2-988f-cbc5415a66aa" />
|
||||
</Frame>
|
||||
|
||||
### Job Applications
|
||||
|
||||
Task: Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs.
|
||||
|
||||
<Frame>
|
||||
<video
|
||||
controls
|
||||
src="https://github.com/user-attachments/assets/171fb4d6-0355-46f2-863e-edb04a828d04"
|
||||
/>
|
||||
</Frame>
|
||||
|
||||
### Flight Search
|
||||
|
||||
Task: Find flights on kayak.com from Zurich to Beijing.
|
||||
|
||||
<Frame>
|
||||
<img src="https://github.com/user-attachments/assets/ea605d4a-90e6-481e-a569-f0e0db7e6390" />
|
||||
</Frame>
|
||||
|
||||
### Data Collection
|
||||
|
||||
Task: Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging Face, save top 5 to file.
|
||||
|
||||
<Frame>
|
||||
<video
|
||||
controls
|
||||
src="https://github.com/user-attachments/assets/de73ee39-432c-4b97-b4e8-939fd7f323b3"
|
||||
/>
|
||||
</Frame>
|
||||
|
||||
## Community & Support
|
||||
|
||||
<CardGroup cols={2}>
|
||||
<Card
|
||||
title="Join Discord"
|
||||
icon="discord"
|
||||
href="https://link.browser-use.com/discord"
|
||||
>
|
||||
Join our community for support and showcases
|
||||
</Card>
|
||||
<Card
|
||||
title="GitHub"
|
||||
icon="github"
|
||||
href="https://github.com/browser-use/browser-use"
|
||||
>
|
||||
Star us on GitHub and contribute to development
|
||||
</Card>
|
||||
</CardGroup>
|
||||
|
||||
<Note>
|
||||
Browser Use is MIT licensed and actively maintained. We welcome contributions
|
||||
and feedback from the community!
|
||||
</Note>
|
||||
7
browser-use/docs/logo/dark.svg
Normal file
7
browser-use/docs/logo/dark.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 9.2 KiB |
7
browser-use/docs/logo/light.svg
Normal file
7
browser-use/docs/logo/light.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 9.2 KiB |
76
browser-use/docs/quickstart.mdx
Normal file
76
browser-use/docs/quickstart.mdx
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
---
|
||||
title: "Quickstart"
|
||||
description: "Start using Browser Use with this quickstart guide"
|
||||
icon: "rocket"
|
||||
---
|
||||
|
||||
{/* You can install Browser Use from PyPI or clone it from GitHub. */}
|
||||
|
||||
## Prepare the environment
|
||||
|
||||
Browser Use requires Python 3.11 or higher.
|
||||
|
||||
First, we recommend using [uv](https://docs.astral.sh/uv/) to set up the Python environment.
|
||||
|
||||
```bash
|
||||
uv venv --python 3.11
|
||||
```
|
||||
|
||||
and activate it with:
|
||||
|
||||
```bash
|
||||
# For Mac/Linux:
|
||||
source .venv/bin/activate
|
||||
|
||||
# For Windows:
|
||||
.venv\Scripts\activate
|
||||
```
|
||||
|
||||
Install the dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install browser-use
|
||||
```
|
||||
|
||||
Then install Playwright:
|
||||
|
||||
```bash
|
||||
uv run playwright install
|
||||
```
|
||||
|
||||
## Create an agent
|
||||
|
||||
Then you can use the agent as follows:
|
||||
|
||||
```python agent.py
|
||||
from langchain_openai import ChatOpenAI
|
||||
from browser_use import Agent
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
import asyncio
|
||||
|
||||
llm = ChatOpenAI(model="gpt-4o")
|
||||
|
||||
async def main():
|
||||
agent = Agent(
|
||||
task="Compare the price of gpt-4o and DeepSeek-V3",
|
||||
llm=llm,
|
||||
)
|
||||
result = await agent.run()
|
||||
print(result)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Set up your LLM API keys
|
||||
|
||||
`ChatOpenAI` and other LangChain chat models require API keys. Store these in your `.env` file; for example, for OpenAI and Anthropic, set the keys as follows:
|
||||
|
||||
|
||||
```bash .env
|
||||
OPENAI_API_KEY=
|
||||
ANTHROPIC_API_KEY=
|
||||
```
|
||||
|
||||
For other LLMs, refer to the [LangChain documentation](https://python.langchain.com/docs/integrations/chat/) to find out how to set them up with their specific API keys.
|
||||
Loading…
Add table
Add a link
Reference in a new issue