[Add] browser-use and main.py

This commit is contained in:
tv0924@icloud.com 2025-05-18 21:57:54 +09:00
commit 96914d44ac
221 changed files with 30952 additions and 1 deletions

View file

@ -0,0 +1,37 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig
# Point the browser at the local macOS Chrome binary.
# NOTE: you need to close your chrome browser - so that this can open your browser in debug mode
browser = Browser(
    config=BrowserConfig(browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'),
)
async def main():
    """Run one agent that writes a letter in Google Docs, then close the browser."""
    letter_agent = Agent(
        task='In docs.google.com write my Papa a quick letter',
        llm=ChatOpenAI(model='gpt-4o'),
        browser=browser,
    )
    await letter_agent.run()

    # The browser is closed before the prompt; Enter only ends the script.
    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,83 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig, BrowserContextConfig
# One chat model shared by every agent created in main().
llm = ChatOpenAI(model='gpt-4o')

# Visible browser; keep_alive is set on both the browser and the contexts it creates.
browser = Browser(
    config=BrowserConfig(
        headless=False,
        disable_security=False,
        keep_alive=True,
        new_context_config=BrowserContextConfig(keep_alive=True, disable_security=False),
    ),
)
async def main():
    """Run the bot-detection checks one after another on the shared browser.

    Each entry pairs an agent task with the prompt shown once that task has
    finished; ``None`` means the next task starts without waiting for Enter.
    """
    checks = [
        (
            '\nGo to https://bot-detector.rebrowser.net/ and verify that all the bot checks are passed.\n',
            'Press Enter to continue to the next test...',
        ),
        (
            '\nGo to https://www.webflow.com/ and verify that the page is not blocked by a bot check.\n',
            'Press Enter to continue to the next test...',
        ),
        (
            '\nGo to https://www.okta.com/ and verify that the page is not blocked by a bot check.\n',
            None,
        ),
        (
            '\nGo to https://abrahamjuliot.github.io/creepjs/ and verify that the detection score is >50%.\n',
            'Press Enter to close the browser...',
        ),
        (
            '\nGo to https://nowsecure.nl/ check the "I\'m not a robot" checkbox.\n',
            'Press Enter to close the browser...',
        ),
    ]

    for task, prompt in checks:
        # A fresh agent per check, all driving the same browser instance.
        agent = Agent(task=task, llm=llm, browser=browser)
        await agent.run()
        if prompt is not None:
            input(prompt)


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,61 @@
"""
Simple demonstration of the CDP feature.
To test this locally, follow these steps:
1. Create a shortcut for the executable Chrome file.
2. Add the following argument to the shortcut:
- On Windows: `--remote-debugging-port=9222`
3. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Remote Debugging Protocol (CDP) is running.
4. Launch this example.
@dev You need to set the `GOOGLE_API_KEY` environment variable before proceeding.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
# Abort at import time when the Gemini key is missing.
api_key = os.environ.get('GOOGLE_API_KEY')
if not api_key:
    raise ValueError('GOOGLE_API_KEY is not set')

# Connect to the Chrome DevTools endpoint described in the module docstring.
browser = Browser(config=BrowserConfig(headless=False, cdp_url='http://localhost:9222'))
controller = Controller()
async def main():
    """Have a Gemini-backed agent write a letter in Google Docs and save it as PDF."""
    # Adjacent literals are concatenated at compile time into the same task text.
    task = (
        'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus'
        ' and save the document as pdf'
    )
    gemini = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(str(api_key)))
    agent = Agent(task=task, llm=gemini, controller=controller, browser=browser)
    await agent.run()

    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,94 @@
"""
Action filters (domains and page_filter) let you limit actions available to the Agent on a step-by-step/page-by-page basis.
@registry.action(..., domains=['*'], page_filter=lambda page: True)
async def some_action(browser: BrowserContext):
...
This helps prevent the LLM from deciding to use an action that is not compatible with the current page.
It helps limit decision fatigue by scoping actions only to pages where they make sense.
It also helps prevent mis-triggering stateful actions or actions that could break other programs or leak secrets.
For example:
- only run on certain domains @registry.action(..., domains=['example.com', '*.example.com', 'example.co.*']) (supports globs, but no regex)
- only fill in a password on a specific login page url
- only run if this action has not run before on this page (e.g. by looking up the url in a file on disk)
During each step, the agent recalculates the actions available specifically for that page, and informs the LLM.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from playwright.async_api import Page
from browser_use.agent.service import Agent, Browser, BrowserContext, Controller
# Initialize controller and registry
# `registry` is the controller's action registry; the @registry.action decorators below register on it.
controller = Controller()
registry = controller.registry
# Action will only be available to Agent on Google domains because of the domain filter
@registry.action(description='Trigger disco mode', domains=['google.com', '*.google.com'])
async def disco_mode(browser: BrowserContext):
    """Apply an endless "wiggle" animation to every element of the current page."""
    # NOTE(review): the script inserts its keyframes into document.styleSheets[0],
    # so it assumes the page already has a stylesheet that accepts insertRule.
    wiggle_js = """() => {
// define the wiggle animation
document.styleSheets[0].insertRule('@keyframes wiggle { 0% { transform: rotate(0deg); } 50% { transform: rotate(10deg); } 100% { transform: rotate(0deg); } }');
document.querySelectorAll("*").forEach(element => {
element.style.animation = "wiggle 0.5s infinite";
});
}"""
    current_page = await browser.get_current_page()
    await current_page.evaluate(wiggle_js)
# you can create a custom page filter function that determines if the action should be available for a given page
def is_login_page(page: Page) -> bool:
    """Return True when the page URL contains 'login' or 'signin', ignoring case."""
    lowered_url = page.url.lower()
    return any(marker in lowered_url for marker in ('login', 'signin'))
# then use it in the action decorator to limit the action to only be available on pages where the filter returns True
@registry.action(description='Use the force, luke', page_filter=is_login_page)
async def use_the_force(browser: BrowserContext):
    """Replace the body of the current page with a fixed message."""
    # this will only ever run on pages that matched the filter
    current_page = await browser.get_current_page()
    assert is_login_page(current_page)

    overwrite_body_js = """() => { document.querySelector('body').innerHTML = 'These are not the droids you are looking for';}"""
    await current_page.evaluate(overwrite_body_js)
async def main():
    """Main function to run the example.

    Creates a browser and an agent wired to the filtered actions registered
    above, runs the agent for at most 10 steps and always closes the browser
    afterwards, even when the run raises.
    """
    browser = Browser()
    llm = ChatOpenAI(model_name='gpt-4o')

    # Create the agent
    # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available,
    # it should work on Google.com though.
    agent = Agent(
        task=(
            '\nGo to apple.com and trigger disco mode (if dont know how to do that, then just move on).\n'
            'Then go to google.com and trigger disco mode.\n'
            'After that, go to the Google login page and Use the force, luke.\n'
        ),
        llm=llm,
        browser=browser,
        controller=controller,
    )

    try:
        # Run the agent
        await agent.run(max_steps=10)
    finally:
        # Cleanup: runs on failure too, so the browser is not left open
        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,98 @@
import asyncio
import json
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import httpx
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import ActionResult, Agent, Controller
# One person in the result; `email` is optional and defaults to None.
class Person(BaseModel):
    name: str
    email: str | None = None


# Container for the list of people; passed as `output_model` to the Controller below.
class PersonList(BaseModel):
    people: list[Person]
# The built-in Google search action is excluded; the final output is validated as a PersonList.
controller = Controller(exclude_actions=['search_google'], output_model=PersonList)

# use the api key for ask tessa
# you can also use other apis like exa, xAI, perplexity, etc.
BEARER_TOKEN = os.environ.get('BEARER_TOKEN')
if not BEARER_TOKEN:
    raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key')
@controller.registry.action('Search the web for a specific query')
async def search_web(query: str):
    """Query the Tessa search API and return the high-scoring sources as JSON text.

    :param query: Free-text search query posted to the API.
    :return: ActionResult whose ``extracted_content`` is a JSON array of the
        sources with a score of at least 0.8, reduced to the keys in
        ``keys_to_use`` that each source actually has.
    :raises httpx.HTTPStatusError: when the API answers with a 4xx/5xx status.
    """
    keys_to_use = ['url', 'title', 'content', 'author', 'score']
    headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
    async with httpx.AsyncClient() as client:
        response = await client.post('https://asktessa.ai/api/search', headers=headers, json={'query': query})

    # Fail with the real HTTP error (bad token, rate limit, ...) rather than a KeyError on the error body.
    response.raise_for_status()

    final_results = [
        {key: source[key] for key in keys_to_use if key in source}
        # A missing 'sources' list yields no results; a source without a score is dropped, not fatal.
        for source in response.json().get('sources', [])
        if source.get('score', 0) >= 0.8
    ]
    result_text = json.dumps(final_results, indent=4)
    print(result_text)
    return ActionResult(extracted_content=result_text, include_in_memory=True)
# People whose email addresses the agent is asked to look up; main() appends
# these names, one per line, to the task text.
names = [
'Ruedi Aebersold',
'Bernd Bodenmiller',
'Eugene Demler',
'Erich Fischer',
'Pietro Gambardella',
'Matthias Huss',
'Reto Knutti',
'Maksym Kovalenko',
'Antonio Lanzavecchia',
'Maria Lukatskaya',
'Jochen Markard',
'Javier Pérez-Ramírez',
'Federica Sallusto',
'Gisbert Schneider',
'Sonia I. Seneviratne',
'Michael Siegrist',
'Johan Six',
'Tanja Stadler',
'Shinichi Sunagawa',
'Michael Bruce Zimmermann',
]
async def main():
    """Ask the agent to look up every name with search_web and print the parsed result."""
    instruction = 'use search_web with "find email address of the following ETH professor:" for each of the following persons in a list of actions. Finally return the list with name and email if provided'
    task = instruction + '\n' + '\n'.join(names)

    agent = Agent(task=task, llm=ChatOpenAI(model='gpt-4o'), controller=controller, max_actions_per_step=20)
    history = await agent.run()

    result = history.final_result()
    if not result:
        print('No result')
        return

    # The final result is JSON text that must validate against PersonList.
    parsed: PersonList = PersonList.model_validate_json(result)
    for person in parsed.people:
        print(f'{person.name} - {person.email}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,60 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import pyperclip
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Visible (non-headless) browser shared by the agent in main().
browser = Browser(config=BrowserConfig(headless=False))
controller = Controller()
@controller.registry.action('Copy text to clipboard')
def copy_to_clipboard(text: str):
    """Copy *text* to the clipboard via pyperclip and return it as the action's extracted content."""
    pyperclip.copy(text)
    outcome = ActionResult(extracted_content=text)
    return outcome
@controller.registry.action('Paste text from clipboard')
async def paste_from_clipboard(browser: BrowserContext):
    """Type the current clipboard text into the active page through the keyboard."""
    clipboard_text = pyperclip.paste()

    # send text to browser
    active_page = await browser.get_current_page()
    await active_page.keyboard.type(clipboard_text)

    return ActionResult(extracted_content=clipboard_text)
async def main():
    """Run the copy/paste demo task, then close the browser."""
    clipboard_agent = Agent(
        task='Copy the text "Hello, world!" to the clipboard, then go to google.com and paste the text',
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
        browser=browser,
    )
    await clipboard_agent.run()

    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,236 @@
"""
Description: These Python modules are designed to capture detailed
browser usage data for analysis, with both server and client
components working together to record and store the information.
Author: Carlos A. Planchón
https://github.com/carlosplanchon/
Adapt this code to your needs.
Feedback is appreciated!
"""
#####################
# #
# --- UTILS --- #
# #
#####################
import base64
def b64_to_png(b64_string: str, output_file):
    """
    Convert a Base64-encoded string to a PNG file.

    The data is decoded before the file is opened, so invalid Base64 raises
    without truncating an existing file or leaving an empty one behind.

    :param b64_string: A string containing Base64-encoded data
    :param output_file: The path to the output PNG file
    :raises binascii.Error: If b64_string is not valid Base64 (e.g. bad padding)
    """
    png_bytes = base64.b64decode(b64_string)
    with open(output_file, 'wb') as f:
        f.write(png_bytes)
###################################################################
# #
# --- FASTAPI API TO RECORD AND SAVE Browser-Use ACTIVITY --- #
# #
###################################################################
# Save to api.py and run with `python api.py`
# ! pip install uvicorn
# ! pip install fastapi
# ! pip install prettyprinter
import json
from pathlib import Path
import prettyprinter
from fastapi import FastAPI, Request
prettyprinter.install_extras()
app = FastAPI()
@app.post('/post_agent_history_step')
async def post_agent_history_step(request: Request):
    """Pretty-print the posted JSON body and save it as the next numbered file in ``recordings/``."""
    data = await request.json()
    prettyprinter.cpprint(data)

    # Ensure the "recordings" folder exists using pathlib
    recordings_folder = Path('recordings')
    recordings_folder.mkdir(exist_ok=True)

    def _recording_number(entry):
        """Return the integer file stem of a ``<number>.json`` file, otherwise None."""
        if not (entry.is_file() and entry.suffix == '.json'):
            return None
        try:
            return int(entry.stem)
        except ValueError:
            # In case the file name isn't just a number
            return None

    # Determine the next file number by examining existing .json files
    numbers = [_recording_number(entry) for entry in recordings_folder.iterdir()]
    taken = [number for number in numbers if number is not None]
    next_number = max(taken) + 1 if taken else 1

    # Construct the file path and save the JSON data to it
    file_path = recordings_folder / f'{next_number}.json'
    with file_path.open('w') as f:
        json.dump(data, f, indent=2)

    return {'status': 'ok', 'message': f'Saved to {file_path}'}
if __name__ == '__main__':
    # uvicorn is imported here because only this server entry point uses it.
    import uvicorn

    # NOTE(review): host '0.0.0.0' listens on all interfaces, while the client in this
    # file posts to 127.0.0.1 - confirm that wider exposure is intended.
    uvicorn.run(app, host='0.0.0.0', port=9000)
##############################################################
# #
# --- CLIENT TO RECORD AND SAVE Browser-Use ACTIVITY --- #
# #
##############################################################
"""
pyobjtojson:
A Python library to safely and recursively serialize any Python object
(including Pydantic models and dataclasses) into JSON-ready structures,
gracefully handling circular references.
"""
# ! pip install -U pyobjtojson
# ! pip install -U prettyprinter
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import requests
from langchain_openai import ChatOpenAI
from pyobjtojson import obj_to_json
from browser_use import Agent
# import prettyprinter
# prettyprinter.install_extras()
def send_agent_history_step(data, timeout: float = 10.0):
    """POST one step summary to the local recording API and return its JSON reply.

    :param data: JSON-serializable payload for the step.
    :param timeout: Seconds to wait for the server. Without a timeout
        ``requests`` waits indefinitely, so a stalled server would hang the caller.
    :return: The decoded JSON body of the response.
    :raises requests.exceptions.RequestException: On connection errors or timeout.
    """
    url = 'http://127.0.0.1:9000/post_agent_history_step'
    response = requests.post(url, json=data, timeout=timeout)
    return response.json()
def _last_json_item(items):
    """Serialize *items* with ``obj_to_json`` and return the last element, or None when there is none."""
    as_json = obj_to_json(obj=items, check_circular=False)
    return as_json[-1] if len(as_json) > 0 else None


async def record_activity(agent_obj):
    """Capture the current page plus the latest history entries and post them to the recording API.

    Passed to ``agent.run`` as the ``on_step_start`` hook. When the agent has no
    ``state`` attribute, the history fields are sent as None; the previous code
    dereferenced the None fallback and raised AttributeError.

    :param agent_obj: The running agent; must expose ``browser_context`` and,
        for history data, ``state.history``.
    """
    print('--- ON_STEP_START HOOK ---')
    website_html: str = await agent_obj.browser_context.get_page_html()
    website_screenshot: str = await agent_obj.browser_context.take_screenshot()

    print('--> History:')
    history = agent_obj.state.history if hasattr(agent_obj, 'state') else None

    model_thoughts_last_elem = None
    model_outputs_json_last_elem = None
    model_actions_json_last_elem = None
    extracted_content_json_last_elem = None
    urls_json_last_elem = None

    if history is not None:
        # Only the most recent entry of each history view goes into the step summary.
        model_thoughts_last_elem = _last_json_item(history.model_thoughts())
        model_outputs_json_last_elem = _last_json_item(history.model_outputs())
        model_actions_json_last_elem = _last_json_item(history.model_actions())
        extracted_content_json_last_elem = _last_json_item(history.extracted_content())
        urls_json_last_elem = _last_json_item(history.urls())

    model_step_summary = {
        'website_html': website_html,
        'website_screenshot': website_screenshot,
        'url': urls_json_last_elem,
        'model_thoughts': model_thoughts_last_elem,
        'model_outputs': model_outputs_json_last_elem,
        'model_actions': model_actions_json_last_elem,
        'extracted_content': extracted_content_json_last_elem,
    }

    print('--- MODEL STEP SUMMARY ---')
    send_agent_history_step(data=model_step_summary)
# Agent whose steps are recorded through the record_activity hook.
agent = Agent(task='Compare the price of gpt-4o and DeepSeek-V3', llm=ChatOpenAI(model='gpt-4o'))


async def run_agent():
    """Run the agent for at most 30 steps, calling record_activity at the start of each step."""
    try:
        await agent.run(on_step_start=record_activity, max_steps=30)
    except Exception as err:
        # Any failure is printed rather than propagated.
        print(err)


asyncio.run(run_agent())

View file

@ -0,0 +1,112 @@
import asyncio
import logging
import os
import sys
from pathlib import Path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import anyio
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
logger = logging.getLogger(__name__)

# Initialize the browser and the controller first; the actions below register on the controller
browser = Browser(
    config=BrowserConfig(
        headless=False,
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    ),
)
controller = Controller()
@controller.action(
    'Upload file to interactive element with file path ',
)
async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
    """Upload the file at *path* to the file-input element found at *index*.

    Returns an error ActionResult when the path is not in
    *available_file_paths*, the file does not exist, no file-upload element can
    be found or located for the index, or setting the input files fails.
    """

    def _logged_error(message):
        """Log *message* at info level and wrap it in an error result."""
        logger.info(message)
        return ActionResult(error=message)

    if path not in available_file_paths:
        return ActionResult(error=f'File path {path} is not available')
    if not os.path.exists(path):
        return ActionResult(error=f'File {path} does not exist')

    node = await browser.get_dom_element_by_index(index)
    upload_node = node.get_file_upload_element()
    if upload_node is None:
        return _logged_error(f'No file upload element found at index {index}')

    upload_handle = await browser.get_locate_element(upload_node)
    if upload_handle is None:
        return _logged_error(f'No file upload element found at index {index}')

    try:
        await upload_handle.set_input_files(path)
        msg = f'Successfully uploaded file to index {index}'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)
    except Exception as e:
        return _logged_error(f'Failed to upload file to index {index}: {str(e)}')
@controller.action('Read the file content of a file given a path')
async def read_file(path: str, available_file_paths: list[str]):
    """Read the text file at *path* and return its content to the agent.

    Failures are reported the same way upload_file reports them: as an
    ``ActionResult(error=...)`` rather than an exception raised from the action.

    :param path: File to read; must be one of *available_file_paths*.
    :param available_file_paths: Paths the agent is allowed to access.
    :return: ActionResult with the content, or with ``error`` set when the
        path is not allowed or the file cannot be read.
    """
    if path not in available_file_paths:
        return ActionResult(error=f'File path {path} is not available')

    try:
        async with await anyio.open_file(path, 'r') as f:
            content = await f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Missing, unreadable or non-text file
        msg = f'Failed to read file {path}: {e}'
        logger.info(msg)
        return ActionResult(error=msg)

    msg = f'File content: {content}'
    logger.info(msg)
    return ActionResult(extracted_content=msg, include_in_memory=True)
def create_file(file_type: str = 'txt'):
    """Write ``tmp.<file_type>`` containing 'test' into the working directory and return its absolute path."""
    file_name = f'tmp.{file_type}'
    with open(file_name, 'w') as handle:
        handle.write('test')

    created = Path.cwd() / file_name
    logger.info(f'Created file: {created}')
    return str(created)
async def main():
    """Create three small test files and let the agent read and upload them on the demo page."""
    task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields'
    available_file_paths = [create_file(kind) for kind in ('txt', 'pdf', 'csv')]

    agent = Agent(
        task=task,
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
        browser=browser,
        available_file_paths=available_file_paths,
    )
    await agent.run()

    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,97 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Parameters of the hover action: the element is identified by index, XPath or CSS selector.
# All three are optional; hover_element rejects a request that provides none of them.
class HoverAction(BaseModel):
    index: int | None = None
    xpath: str | None = None
    selector: str | None = None
# Visible (non-headless) browser shared by the agent in main().
browser = Browser(config=BrowserConfig(headless=False))
controller = Controller()
@controller.registry.action(
    'Hover over an element',
    param_model=HoverAction,  # Define this model with at least "index: int" field
)
async def hover_element(params: HoverAction, browser: BrowserContext):
    """
    Hovers over the element specified by XPath, by CSS selector, or by its index
    from the cached selector map (checked in that order).

    The success message names the locator that was actually used. Previously a
    selector-based hover was reported as "XPath None", and an index supplied
    next to an XPath was reported even though the XPath had been used.

    Raises Exception when no locator is given, the element cannot be located,
    or the hover itself fails (chained to the underlying error).
    """
    session = await browser.get_session()
    state = session.cached_state

    if params.xpath:
        # Use XPath to locate the element
        element_handle = await browser.get_locate_element_by_xpath(params.xpath)
        if element_handle is None:
            raise Exception(f'Failed to locate element with XPath {params.xpath}')
        target = f'with XPath {params.xpath}'
    elif params.selector:
        # Use CSS selector to locate the element
        element_handle = await browser.get_locate_element_by_css_selector(params.selector)
        if element_handle is None:
            raise Exception(f'Failed to locate element with CSS Selector {params.selector}')
        target = f'with CSS Selector {params.selector}'
    elif params.index is not None:
        # Use index to locate the element
        if state is None or params.index not in state.selector_map:
            raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')
        element_node = state.selector_map[params.index]
        element_handle = await browser.get_locate_element(element_node)
        if element_handle is None:
            raise Exception(f'Failed to locate element with index {params.index}')
        target = f'at index {params.index}'
    else:
        raise Exception('Either index, xpath or selector must be provided')

    try:
        await element_handle.hover()
    except Exception as e:
        # Keep the original error as the cause so the traceback stays useful
        raise Exception(f'❌ Failed to hover over element: {str(e)}') from e

    msg = f'🖱️ Hovered over element {target}'
    return ActionResult(extracted_content=msg, include_in_memory=True)
async def main():
    """Run the hover demo against the test page, then close the browser."""
    task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the css selector #hoverdivpara, then click on "Can you click me?"'
    # XPath variant of the same task:
    # task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the xpath //*[@id="hoverdivpara"], then click on "Can you click me?"'
    hover_agent = Agent(
        task=task,
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
        browser=browser,
    )
    await hover_agent.run()

    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,45 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import ActionResult, Agent, Controller
controller = Controller()


@controller.registry.action('Done with task ')
async def done(text: str):
    """Email *text* through yagmail and return a result that marks the task as done."""
    # yagmail is imported inside the action that uses it.
    import yagmail

    # To send emails use
    # STEP 1: go to https://support.google.com/accounts/answer/185833
    # STEP 2: Create an app password (you can't use here your normal gmail password)
    # STEP 3: Use the app password in the code below for the password
    mailer = yagmail.SMTP('your_email@gmail.com', 'your_app_password')
    mailer.send(to='recipient@example.com', subject='Test Email', contents=f'result\n: {text}')

    return ActionResult(is_done=True, extracted_content='Email sent!')
async def main():
    """Run an agent that visits browser-use.com and then calls the custom `done` action."""
    # Fixed the misspelled domain ('brower-use.com') that sent the agent to the wrong site.
    task = 'go to browser-use.com and then done'
    model = ChatOpenAI(model='gpt-4o')
    agent = Agent(task=task, llm=model, controller=controller)
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,57 @@
import asyncio
import logging
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from onepassword.client import Client # pip install onepassword-sdk
from browser_use import ActionResult, Agent, Controller
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Both values are read from the environment (after load_dotenv() above); os.getenv returns None when unset.
OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN')
OP_ITEM_ID = os.getenv('OP_ITEM_ID') # Go to 1Password, right click on the item, click "Copy Secret Reference"
# Controller on which the custom 1Password action below is registered.
controller = Controller()
@controller.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com'])
async def get_1password_2fa() -> ActionResult:
    """
    Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK.

    Returns an error result when OP_SERVICE_ACCOUNT_TOKEN or OP_ITEM_ID is unset,
    rather than authenticating with ``None`` or resolving ``op://Private/None/...``.
    """
    required = (('OP_SERVICE_ACCOUNT_TOKEN', OP_SERVICE_ACCOUNT_TOKEN), ('OP_ITEM_ID', OP_ITEM_ID))
    missing = [name for name, value in required if not value]
    if missing:
        msg = f'Cannot get 2FA code from 1Password: {", ".join(missing)} not set'
        logger.error(msg)
        return ActionResult(error=msg)

    client = await Client.authenticate(
        # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started
        auth=OP_SERVICE_ACCOUNT_TOKEN,
        integration_name='Browser-Use',
        integration_version='v1.0.0',
    )
    mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode')
    return ActionResult(extracted_content=mfa_code)
async def main():
    """Run the Google sign-in task with the 1Password 2FA action available and print the result."""
    # Example task using the 1Password 2FA action
    login_agent = Agent(
        task='Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it',
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
    )
    result = await login_agent.run()
    print(f'Task completed with result: {result}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,50 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use.agent.service import Agent
from browser_use.controller.service import Controller
# Initialize controller first
controller = Controller()


# One model entry: title, URL, like count and license.
class Model(BaseModel):
    title: str
    url: str
    likes: int
    license: str


# Parameter model of the 'Save models' action: the entries to write in one call.
class Models(BaseModel):
    models: list[Model]
@controller.action('Save models', param_model=Models)
def save_models(params: Models):
    """Append one line per model to ``models.txt``.

    The file is opened as UTF-8 explicitly. With the platform default encoding
    a title containing characters outside that encoding would raise
    UnicodeEncodeError.

    :param params: The models to record.
    """
    lines = [f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n' for model in params.models]
    with open('models.txt', 'a', encoding='utf-8') as f:
        f.writelines(lines)
# video: https://preview.screen.studio/share/EtOhIk0P
async def main():
    """Run the Hugging Face lookup task with the 'Save models' action available."""
    lookup_agent = Agent(
        task='Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.',
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
    )
    await lookup_agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,210 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from aiohttp import web # make sure to install aiohttp: pip install aiohttp
from langchain_openai import ChatOpenAI
# from langchain_google_genai import ChatGoogleGenerativeAI
from browser_use import Agent, Controller
# Define a simple HTML page
# It contains four div-based custom dropdowns (toggled by the inline script),
# one native <select> and a button that raises an alert; handle_root serves it.
HTML_CONTENT = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Custom Select Div</title>
<style>
.custom-select {
position: relative;
width: 200px;
font-family: Arial, sans-serif;
margin-bottom: 20px;
}
.select-display {
padding: 10px;
border: 1px solid #ccc;
background-color: #fff;
cursor: pointer;
}
.select-options {
position: absolute;
top: 100%;
left: 0;
right: 0;
border: 1px solid #ccc;
border-top: none;
background-color: #fff;
display: none;
max-height: 150px;
overflow-y: auto;
z-index: 100;
}
.select-option {
padding: 10px;
cursor: pointer;
}
.select-option:hover {
background-color: #f0f0f0;
}
</style>
</head>
<body>
<div class="custom-select">
<div class="select-display">Select a fruit</div>
<div class="select-options">
<div class="select-option" data-value="option1">Apples</div>
<div class="select-option" data-value="option2">Oranges</div>
<div class="select-option" data-value="option3">Pineapples</div>
</div>
</div>
<div class="custom-select">
<div class="select-display">Select a fruit</div>
<div class="select-options">
<div class="select-option" data-value="option1">Apples</div>
<div class="select-option" data-value="option2">Oranges</div>
<div class="select-option" data-value="option3">Pineapples</div>
</div>
</div>
<div class="custom-select">
<div class="select-display">Select a fruit</div>
<div class="select-options">
<div class="select-option" data-value="option1">Apples</div>
<div class="select-option" data-value="option2">Oranges</div>
<div class="select-option" data-value="option3">Pineapples</div>
</div>
</div>
<div class="custom-select">
<div class="select-display">Select a fruit</div>
<div class="select-options">
<div class="select-option" data-value="option1">Apples</div>
<div class="select-option" data-value="option2">Oranges</div>
<div class="select-option" data-value="option3">Pineapples</div>
</div>
</div>
<label for="cars">Choose a car:</label>
<select name="cars" id="cars">
<option value="volvo">Volvo</option>
<option value="bmw">BMW</option>
<option value="mercedes">Mercedes</option>
<option value="audi">Audi</option>
</select>
<button onclick="alert('I told you!')">Don't click me</button>
<script>
document.querySelectorAll('.custom-select').forEach(customSelect => {
const selectDisplay = customSelect.querySelector('.select-display');
const selectOptions = customSelect.querySelector('.select-options');
const options = customSelect.querySelectorAll('.select-option');
selectDisplay.addEventListener('click', (e) => {
// Close all other dropdowns
document.querySelectorAll('.select-options').forEach(opt => {
if (opt !== selectOptions) opt.style.display = 'none';
});
// Toggle current dropdown
const isVisible = selectOptions.style.display === 'block';
selectOptions.style.display = isVisible ? 'none' : 'block';
e.stopPropagation();
});
options.forEach(option => {
option.addEventListener('click', () => {
selectDisplay.textContent = option.textContent;
selectDisplay.dataset.value = option.getAttribute('data-value');
selectOptions.style.display = 'none';
});
});
});
// Close all dropdowns if clicking outside
document.addEventListener('click', () => {
document.querySelectorAll('.select-options').forEach(opt => {
opt.style.display = 'none';
});
});
</script>
</body>
</html>
"""
# aiohttp request handler to serve the HTML content
async def handle_root(request):
    """Respond with HTML_CONTENT as ``text/html``; the request itself is not inspected."""
    page = web.Response(text=HTML_CONTENT, content_type='text/html')
    return page
# Function to run the HTTP server
async def run_http_server():
    """Serve the demo page on http://localhost:8000 until the surrounding task is cancelled.

    The runner is cleaned up in ``finally`` so the listening socket is released
    when the task is cancelled and when starting the site fails (for example
    because the port is already in use).
    """
    app = web.Application()
    app.router.add_get('/', handle_root)
    runner = web.AppRunner(app)
    await runner.setup()
    try:
        site = web.TCPSite(runner, 'localhost', 8000)
        await site.start()
        print('HTTP server running on http://localhost:8000')
        # Keep the server running indefinitely.
        await asyncio.Event().wait()
    finally:
        await runner.cleanup()
# Your agent tasks and other logic
controller = Controller()


async def main():
    """Serve the demo page in the background and run each example task against it."""
    # Start the HTTP server in the background.
    server_task = asyncio.create_task(run_http_server())

    # Example tasks for the agent, keyed by the kind of interaction they exercise.
    tasks = {
        'xpath': 'Open http://localhost:8000/, click element with the xpath "/html/body/div/div[1]" and then click on Oranges',
        'css_selector': 'Open http://localhost:8000/, click element with the selector div.select-display and then click on apples',
        'text': 'Open http://localhost:8000/, click the third element with the text "Select a fruit" and then click on Apples, then click the second element with the text "Select a fruit" and then click on Oranges',
        'select': 'Open http://localhost:8000/, choose the car BMW',
        'button': 'Open http://localhost:8000/, click on the button',
    }

    # Swap in ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite") here to try Gemini instead.
    llm = ChatOpenAI(model='gpt-4o')

    # Run different agent tasks (dicts preserve insertion order).
    for task in tasks.values():
        agent = Agent(task=task, llm=llm, controller=controller)
        await agent.run()

    # Wait for user input before shutting down.
    input('Press Enter to close...')

    # Cancel the server task once finished.
    server_task.cancel()
    try:
        await server_task
    except asyncio.CancelledError:
        print('HTTP server stopped.')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,52 @@
"""
Example of how it supports cross-origin iframes.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
# Fail fast: the OpenAI chat model used below needs an API key.
if not os.environ.get('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')

# Launch the Chrome binary found at this path.
browser = Browser(
    config=BrowserConfig(browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
)
controller = Controller()


async def main():
    """Run the cross-site iframe task, close the browser, then wait for Enter."""
    llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
    agent = Agent(
        task='Click "Go cross-site (simple page)" button on https://csreis.github.io/tests/cross-site-iframe.html then tell me the text within',
        llm=llm,
        controller=controller,
        browser=browser,
    )
    await agent.run()
    await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    try:
        asyncio.run(main())
    except Exception as e:
        # Any failure is reported as its message only, without a traceback.
        print(e)

View file

@ -0,0 +1,59 @@
"""
Show how to use custom outputs.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import Agent, Controller
# One post entry of the structured result; main() prints each of these fields.
class Post(BaseModel):
    post_title: str
    post_url: str
    num_comments: int
    hours_since_post: int


# Top-level result schema: main() validates the agent's final JSON against it.
class Posts(BaseModel):
    posts: list[Post]


# The controller is given the schema as its output model
# (presumably so the agent's final answer follows it - confirm in Controller).
controller = Controller(output_model=Posts)
async def main():
    """Ask the agent for Show HN posts and print every parsed entry."""
    agent = Agent(
        task='Go to hackernews show hn and give me the first 5 posts',
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
    )

    history = await agent.run()
    result = history.final_result()
    if not result:
        print('No result')
        return

    # The final result is a JSON string; validate it into the Posts model.
    parsed: Posts = Posts.model_validate_json(result)
    for post in parsed.posts:
        print('\n--------------------------------')
        print(f'Title:            {post.post_title}')
        print(f'URL:              {post.post_url}')
        print(f'Comments:         {post.num_comments}')
        print(f'Hours since post: {post.hours_since_post}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,39 @@
import asyncio
import json
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Extra instruction handed to the agent through extend_system_message.
extend_system_message = (
    'REMEMBER the most important RULE: ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!'
)

# or use override_system_message to completely override the system prompt


async def main():
    """Build the agent, print its system prompt as JSON, then run it."""
    agent = Agent(
        task="do google search to find images of Elon Musk's wife",
        llm=ChatOpenAI(model='gpt-4o'),
        extend_system_message=extend_system_message,
    )

    # Dump the system prompt so the extension can be inspected.
    prompt_fields = agent.message_manager.system_prompt.model_dump(exclude_unset=True)
    print(json.dumps(prompt_fields, indent=4))

    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,79 @@
import argparse
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from browser_use.controller.service import Controller
def get_llm(provider: str):
    """Return the chat model for *provider* ('openai' or 'anthropic').

    Raises:
        ValueError: if *provider* is neither of the supported names.
    """
    if provider == 'openai':
        return ChatOpenAI(model='gpt-4o', temperature=0.0)
    if provider == 'anthropic':
        return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)
    raise ValueError(f'Unsupported provider: {provider}')
# NOTE: This example is to find your current user agent string to use it in the browser_context
task = 'go to https://whatismyuseragent.com and find the current user agent string '

controller = Controller()

# Command-line options: --query overrides the default task, --provider picks the LLM backend.
# Arguments are parsed at module level, before the agent is built.
parser = argparse.ArgumentParser()
parser.add_argument('--query', type=str, help='The query to process', default=task)
parser.add_argument(
    '--provider',
    type=str,
    choices=['openai', 'anthropic'],
    default='openai',
    help='The model provider to use (default: openai)',
)
args = parser.parse_args()

llm = get_llm(args.provider)

browser = Browser(
    config=BrowserConfig(
        # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    )
)

# The context is configured with a custom user agent string.
browser_context = BrowserContext(config=BrowserContextConfig(user_agent='foobarfoo'), browser=browser)

# The agent is given the context directly rather than the browser.
agent = Agent(
    task=args.query,
    llm=llm,
    controller=controller,
    # browser=browser,
    browser_context=browser_context,
    use_vision=True,
    max_actions_per_step=1,
)
async def main():
    """Run the agent, wait for Enter, then release the context and the browser."""
    try:
        await agent.run(max_steps=25)
        input('Press Enter to close the browser...')
    finally:
        await browser_context.close()
        # The Browser instance was created explicitly by this script, so it is
        # closed explicitly too; closing only the context left it open.
        await browser.close()


asyncio.run(main())

View file

@ -0,0 +1,42 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig
# The Gemini model below needs a key; stop early when it is missing.
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    raise ValueError('GOOGLE_API_KEY is not set')

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))

# New contexts are configured to save downloads under ~/downloads.
browser = Browser(
    config=BrowserConfig(
        new_context_config=BrowserContextConfig(
            save_downloads_path=os.path.join(os.path.expanduser('~'), 'downloads'),
        )
    )
)


async def run_download():
    """Run the download task for at most 25 steps, then close the browser."""
    download_agent = Agent(
        task='Go to "https://file-examples.com/" and download the smallest doc file.',
        llm=llm,
        browser=browser,
        use_vision=True,
        max_actions_per_step=8,
    )
    await download_agent.run(max_steps=25)
    await browser.close()


if __name__ == '__main__':
    asyncio.run(run_download())

View file

@ -0,0 +1,51 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
# Prompt for the SortableJS drag-and-drop demo (the task run_search() executes).
task_1 = """
Navigate to: https://sortablejs.github.io/Sortable/.
Then scroll down to the first example with title "Simple list example".
Drag the element with name "item 1" to below the element with name "item 3".
"""
task_2 = """
Navigate to: https://excalidraw.com/.
Click on the pencil icon (with index 40).
Then draw a triangle in the canvas.
Draw the triangle starting from coordinate (400,400).
You can use the drag and drop action to draw the triangle.
"""
async def run_search():
    """Run task_1 with vision enabled, one action per step, for at most 25 steps."""
    drag_agent = Agent(
        task=task_1,
        llm=llm,
        use_vision=True,
        max_actions_per_step=1,
    )
    await drag_agent.run(max_steps=25)


if __name__ == '__main__':
    asyncio.run(run_search())

View file

@ -0,0 +1,50 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig, BrowserContextConfig, Controller
# Initialize the model
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)

# Get your chrome path
browser = Browser(
    config=BrowserConfig(
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        new_context_config=BrowserContextConfig(keep_alive=True),
    ),
)
controller = Controller()

task = 'Find the founders of browser-use and draft them a short personalized message'
agent = Agent(task=task, llm=llm, controller=controller, browser=browser)


async def main():
    """Run the initial task, then add a follow-up task to the same agent and run again."""
    await agent.run()

    # new_task = input('Type in a new task: ')
    follow_up = 'Find an image of the founders'
    agent.add_new_task(follow_up)

    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,34 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
llm = ChatOpenAI(model='gpt-4o')

# Actions passed to the agent as initial_actions: two tabs are opened, then the page is scrolled.
initial_actions = [
    {'open_tab': {'url': 'https://www.google.com'}},
    {'open_tab': {'url': 'https://en.wikipedia.org/wiki/Randomness'}},
    {'scroll_down': {'amount': 1000}},
]

agent = Agent(
    llm=llm,
    initial_actions=initial_actions,
    task='What theories are displayed on the page?',
)


async def main():
    """Run the pre-configured agent for at most 10 steps."""
    await agent.run(max_steps=10)


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,33 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# video: https://preview.screen.studio/share/clenCmS6
llm = ChatOpenAI(model='gpt-4o')

# Single agent whose task involves opening several tabs and switching back.
agent = Agent(
    llm=llm,
    task='open 3 tabs with elon musk, trump, and steve jobs, then go back to the first and stop',
)


async def main():
    """Run the multi-tab agent to completion."""
    await agent.run()


asyncio.run(main())

View file

@ -0,0 +1,67 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser
# Video: https://preview.screen.studio/share/8Elaq9sm
async def main():
    """Interactive loop that runs user-supplied tasks in one shared browser context.

    Input handling:
        'p'  pause the current agent (if any)
        'r'  resume the current agent (if any)
        'b'  stop the current agent (if any) and forget it
        anything else is treated as a new task: the current agent is paused
        and a new agent is started in the background on the same context.
    """
    # Persist the browser state across agents
    browser = Browser()
    async with await browser.new_context() as context:
        model = ChatOpenAI(model='gpt-4o')
        current_agent = None
        # The event loop keeps only weak references to tasks, so hold strong
        # references here; otherwise a running agent task could be
        # garbage-collected before it finishes.
        background_runs = set()

        async def get_input():
            # input() blocks, so it runs in the default executor to let the
            # background agent tasks keep making progress.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, lambda: input('Enter task (p: pause current agent, r: resume, b: break): ')
            )

        while True:
            task = await get_input()

            if task.lower() == 'p':
                # Pause the current agent if one exists
                if current_agent:
                    current_agent.pause()
                continue
            elif task.lower() == 'r':
                # Resume the current agent if one exists
                if current_agent:
                    current_agent.resume()
                continue
            elif task.lower() == 'b':
                # Break the current agent's execution if one exists
                if current_agent:
                    current_agent.stop()
                    current_agent = None
                continue

            # If there's a current agent running, pause it before starting new one
            if current_agent:
                current_agent.pause()

            # Create and run new agent with the task
            current_agent = Agent(
                task=task,
                llm=model,
                browser_context=context,
            )

            # Run the agent asynchronously without blocking, keeping a
            # reference until the run completes.
            run_task = asyncio.create_task(current_agent.run())
            background_runs.add(run_task)
            run_task.add_done_callback(background_runs.discard)


asyncio.run(main())
# Now add the cheapest to the cart

View file

@ -0,0 +1,70 @@
"""
Show how to save the agent state to a file and load it back between steps.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import anyio
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.agent.views import AgentState
from browser_use.browser.browser import Browser, BrowserConfig
async def main():
    # Takes single agent steps on a shared headless browser context and
    # round-trips the injected agent state through agent_state.json.
    task = 'Go to hackernews show hn and give me the first 5 posts'

    browser = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )

    browser_context = await browser.new_context()

    agent_state = AgentState()

    for i in range(10):
        # A fresh Agent is built each iteration; continuity comes from the
        # shared browser context and the injected agent_state.
        agent = Agent(
            task=task,
            llm=ChatOpenAI(model='gpt-4o'),
            browser=browser,
            browser_context=browser_context,
            injected_agent_state=agent_state,
            page_extraction_llm=ChatOpenAI(model='gpt-4o-mini'),
        )

        done, valid = await agent.take_step()
        print(f'Step {i}: Done: {done}, Valid: {valid}')

        if done and valid:
            break

        # History is cleared and also excluded from the serialized state below.
        agent_state.history.history = []

        # Save state to file
        async with await anyio.open_file('agent_state.json', 'w') as f:
            serialized = agent_state.model_dump_json(exclude={'history'})
            await f.write(serialized)

        # Load state back from file
        async with await anyio.open_file('agent_state.json', 'r') as f:
            loaded_json = await f.read()
            agent_state = AgentState.model_validate_json(loaded_json)

        # NOTE(review): this unconditional break ends the loop after the first
        # save/load round trip; its exact nesting was lost with the original
        # indentation - confirm against the source file.
        break


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,59 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig
# One shared browser; each new context is configured with a recording path.
browser = Browser(
    config=BrowserConfig(
        headless=False,
        disable_security=True,
        new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
    )
)
llm = ChatOpenAI(model='gpt-4o')


async def main():
    """Run several agents concurrently on one browser, then one more agent, then close it."""
    parallel_tasks = [
        'Search Google for weather in Tokyo',
        'Check Reddit front page title',
        'Look up Bitcoin price on Coinbase',
        'Find NASA image of the day',
        # 'Check top story on CNN',
        # 'Search latest SpaceX launch date',
        # 'Look up population of Paris',
        # 'Find current time in Sydney',
        # 'Check who won last Super Bowl',
        # 'Search trending topics on Twitter',
    ]
    agents = [Agent(task=description, llm=llm, browser=browser) for description in parallel_tasks]

    # All agents share the same browser and run concurrently.
    await asyncio.gather(*(agent.run() for agent in agents))

    # async with await browser.new_context() as context:
    agentX = Agent(
        task='Go to apple.com and return the title of the page',
        llm=llm,
        browser=browser,
        # browser_context=context,
    )
    await agentX.run()

    await browser.close()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,103 @@
import asyncio
import os
import sys
import threading
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
class AgentController:
    """Own a single Agent and expose start/pause/resume/stop controls for it.

    ``start`` is blocking and is meant to be used as a thread target; the
    other methods forward to the agent and can be called from another thread.
    ``running`` is True only while the agent's run is in progress.
    """

    def __init__(self):
        llm = ChatOpenAI(model='gpt-4o')
        self.agent = Agent(
            task='open in one action https://www.google.com, https://www.wikipedia.org, https://www.youtube.com, https://www.github.com, https://amazon.com',
            llm=llm,
        )
        self.running = False

    async def run_agent(self):
        """Run the agent"""
        self.running = True
        try:
            await self.agent.run()
        finally:
            # Clear the flag when the run ends on its own (or fails), not only
            # when stop() is called; otherwise it stays True forever.
            self.running = False

    def start(self):
        """Start the agent in a separate thread"""
        # Each thread needs its own event loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self.run_agent())
        finally:
            # Release the loop's resources once the run is over.
            loop.close()

    def pause(self):
        """Pause the agent"""
        self.agent.pause()

    def resume(self):
        """Resume the agent"""
        self.agent.resume()

    def stop(self):
        """Stop the agent"""
        self.agent.stop()
        self.running = False
def print_menu():
    """Print the numbered agent-control menu, preceded by a blank line."""
    menu_lines = (
        '\nAgent Control Menu:',
        '1. Start',
        '2. Pause',
        '3. Resume',
        '4. Stop',
        '5. Exit',
    )
    for entry in menu_lines:
        print(entry)
async def main():
    """Menu loop that drives an AgentController running on a worker thread."""
    controller = AgentController()
    agent_thread = None

    while True:
        print_menu()
        try:
            choice = input('Enter your choice (1-5): ')
        except KeyboardInterrupt:
            # Ctrl+C at the prompt is treated as "Exit".
            choice = '5'

        if choice == '5':
            print('Exiting...')
            if controller.running:
                controller.stop()
            if agent_thread:
                agent_thread.join()
            break

        if choice == '1' and not agent_thread:
            print('Starting agent...')
            agent_thread = threading.Thread(target=controller.start)
            agent_thread.start()
        elif choice == '2':
            print('Pausing agent...')
            controller.pause()
        elif choice == '3':
            print('Resuming agent...')
            controller.resume()
        elif choice == '4':
            print('Stopping agent...')
            controller.stop()
            if agent_thread:
                # Wait for the worker thread to finish before allowing a restart.
                agent_thread.join()
                agent_thread = None

        await asyncio.sleep(0.1)  # Small delay to prevent CPU spinning


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,30 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Main model for the agent, plus a separate model passed as planner_llm.
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
planner_llm = ChatOpenAI(model='o3-mini')

task = 'your task'

agent = Agent(
    task=task,
    llm=llm,
    planner_llm=planner_llm,
    planner_interval=1,
    use_vision_for_planner=False,
)


async def main():
    """Run the planner-enabled agent."""
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,136 @@
import asyncio
import os
import sys
from pathlib import Path
# Ensure the project root is in the Python path if running directly
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig
# Define the task for the agent (steps numbered consecutively so none appears to be missing)
TASK_DESCRIPTION = """
1. Go to amazon.com
2. Search for 'i7 14700k'
3. If there is an 'Add to Cart' button, open the product page and then click add to cart.
4. Then open the shopping cart page /cart button/ go to cart button.
5. Scroll down to the bottom of the cart page.
6. Scroll up to the top of the cart page.
7. Finish the task.
"""
# Define the path where the Playwright script will be saved
SCRIPT_DIR = Path('./playwright_scripts')
SCRIPT_PATH = SCRIPT_DIR / 'playwright_amazon_cart_script.py'
# Helper function to stream output from the subprocess
async def stream_output(stream, prefix):
    """Print every line read from *stream*, prefixed with ``'<prefix>: '``.

    Args:
        stream: object with an awaitable ``readline()`` returning bytes
            (e.g. a subprocess pipe), or None when no pipe is available.
        prefix: label printed in front of each line.

    Reading stops at end of stream (an empty read).
    """
    if stream is None:
        print(f'{prefix}: (No stream available)')
        return
    while True:
        line = await stream.readline()
        if not line:
            break
        # errors='replace' keeps streaming when the child emits bytes that are
        # not valid UTF-8 instead of raising UnicodeDecodeError.
        print(f'{prefix}: {line.decode(errors="replace").rstrip()}', flush=True)
async def main():
    """Let the agent generate a Playwright script, then execute that script in a subprocess."""
    # Initialize the language model
    llm = ChatOpenAI(model='gpt-4.1', temperature=0.0)

    # Configure the browser
    # Use headless=False if you want to watch the agent visually
    browser = Browser(config=BrowserConfig(headless=False))

    # Configure the agent
    # The 'save_playwright_script_path' argument tells the agent where to save the script
    agent = Agent(
        task=TASK_DESCRIPTION,
        llm=llm,
        browser=browser,
        save_playwright_script_path=str(SCRIPT_PATH),  # Pass the path as a string
    )

    print('Running the agent to generate the Playwright script...')
    try:
        history = await agent.run()
        print('Agent finished running.')

        if not history:
            print('Agent run did not return a history object.')
        elif history.is_successful():
            print(f'Agent completed the task successfully. Final result: {history.final_result()}')
        else:
            print('Agent finished, but the task might not be fully successful.')
            if history.has_errors():
                print(f'Errors encountered: {history.errors()}')
    except Exception as e:
        print(f'An error occurred during the agent run: {e}')
        # Ensure browser is closed even if agent run fails
        if browser:
            await browser.close()
        return  # Exit if agent failed

    # --- Execute the Generated Playwright Script ---
    print(f'\nChecking if Playwright script was generated at: {SCRIPT_PATH}')

    if not SCRIPT_PATH.exists():
        print(f'\n❌ Playwright script not found at {SCRIPT_PATH}. Generation might have failed.')
    else:
        print('Playwright script found. Attempting to execute...')
        try:
            # Ensure the script directory exists before running
            SCRIPT_DIR.mkdir(parents=True, exist_ok=True)

            # Run the generated script with the current interpreter.
            process = await asyncio.create_subprocess_exec(
                sys.executable,
                str(SCRIPT_PATH),
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                cwd=Path.cwd(),  # Run from the current working directory
            )

            print('\n--- Playwright Script Execution ---')
            # Stream stdout and stderr concurrently, then wait for the process itself.
            await asyncio.gather(
                asyncio.create_task(stream_output(process.stdout, 'stdout')),
                asyncio.create_task(stream_output(process.stderr, 'stderr')),
            )
            exit_code = await process.wait()
            print('-------------------------------------')

            if exit_code != 0:
                print(f'\n⚠️ Playwright script finished with exit code {exit_code}.')
            else:
                print('\n✅ Playwright script executed successfully!')
        except Exception as e:
            print(f'\n❌ An error occurred while executing the Playwright script: {e}')

    # Close the browser used by the agent (if not already closed by agent.run error handling)
    # Note: The generated script manages its own browser instance.
    if browser:
        await browser.close()
        print("Agent's browser closed.")


if __name__ == '__main__':
    # Ensure the script directory is clean before running (optional)
    if SCRIPT_PATH.exists():
        SCRIPT_PATH.unlink()
        print(f'Removed existing script: {SCRIPT_PATH}')

    # Run the main async function
    asyncio.run(main())

View file

@ -0,0 +1,47 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContextConfig
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)

task = (
    "go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?"
)

# Only this domain is listed in the context's allowed_domains setting.
allowed_domains = ['google.com']

context_config = BrowserContextConfig(allowed_domains=allowed_domains)
browser = Browser(
    config=BrowserConfig(
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        new_context_config=context_config,
    ),
)

agent = Agent(task=task, llm=llm, browser=browser)


async def main():
    """Run the agent for at most 25 steps, wait for Enter, then close the browser."""
    await agent.run(max_steps=25)

    input('Press Enter to close the browser...')
    await browser.close()


asyncio.run(main())

View file

@ -0,0 +1,60 @@
import asyncio
import os
import sys
from pprint import pprint
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.agent.views import AgentHistoryList
from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig
llm = ChatOpenAI(model='gpt-4o')
browser = Browser(config=BrowserConfig(headless=False, disable_security=True))


async def main():
    """Run a short search task and pretty-print the pieces of the returned history."""
    context_config = BrowserContextConfig(
        trace_path='./tmp/result_processing',
        no_viewport=False,
        window_width=1280,
        window_height=1000,
    )
    async with await browser.new_context(config=context_config) as browser_context:
        agent = Agent(
            task="go to google.com and type 'OpenAI' click search and give me the first url",
            llm=llm,
            browser_context=browser_context,
        )
        history: AgentHistoryList = await agent.run(max_steps=3)

        print('Final Result:')
        pprint(history.final_result(), indent=4)

        print('\nErrors:')
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print('\nModel Outputs:')
        pprint(history.model_actions(), indent=4)

        print('\nThoughts:')
        pprint(history.model_thoughts(), indent=4)

    # close browser
    await browser.close()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,34 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContextConfig
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)


async def main():
    """Run one agent in a context configured with a trace path, then close the browser."""
    browser = Browser()
    trace_config = BrowserContextConfig(trace_path='./tmp/traces/')

    async with await browser.new_context(config=trace_config) as context:
        agent = Agent(
            task='Go to hackernews, then go to apple.com and return all titles of open tabs',
            llm=llm,
            browser_context=context,
        )
        await agent.run()

    await browser.close()


asyncio.run(main())

View file

@ -0,0 +1,32 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Initialize the model
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)

# the model will see x_name and x_password, but never the actual values.
sensitive_data = {
    'x_name': 'my_x_name',
    'x_password': 'my_x_password',
}

# The task text refers to the credentials only by their placeholder names.
task = 'go to x.com and login with x_name and x_password then find interesting posts and like them'

agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data)


async def main():
    """Run the agent configured with the sensitive-data placeholders."""
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,26 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Main model for the agent, plus a smaller model passed as page_extraction_llm.
llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
small_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.0)

task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one'

agent = Agent(
    task=task,
    llm=llm,
    page_extraction_llm=small_llm,
)


async def main():
    """Run the agent that uses the smaller extraction model."""
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,102 @@
import asyncio
import json
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import anyio
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import Agent, Browser, BrowserConfig, Controller
links = [
'https://docs.mem0.ai/components/llms/models/litellm',
'https://docs.mem0.ai/components/llms/models/mistral_AI',
'https://docs.mem0.ai/components/llms/models/ollama',
'https://docs.mem0.ai/components/llms/models/openai',
'https://docs.mem0.ai/components/llms/models/together',
'https://docs.mem0.ai/components/llms/models/xAI',
'https://docs.mem0.ai/components/llms/overview',
'https://docs.mem0.ai/components/vectordbs/config',
'https://docs.mem0.ai/components/vectordbs/dbs/azure_ai_search',
'https://docs.mem0.ai/components/vectordbs/dbs/chroma',
'https://docs.mem0.ai/components/vectordbs/dbs/elasticsearch',
'https://docs.mem0.ai/components/vectordbs/dbs/milvus',
'https://docs.mem0.ai/components/vectordbs/dbs/opensearch',
'https://docs.mem0.ai/components/vectordbs/dbs/pgvector',
'https://docs.mem0.ai/components/vectordbs/dbs/pinecone',
'https://docs.mem0.ai/components/vectordbs/dbs/qdrant',
'https://docs.mem0.ai/components/vectordbs/dbs/redis',
'https://docs.mem0.ai/components/vectordbs/dbs/supabase',
'https://docs.mem0.ai/components/vectordbs/dbs/vertex_ai_vector_search',
'https://docs.mem0.ai/components/vectordbs/dbs/weaviate',
'https://docs.mem0.ai/components/vectordbs/overview',
'https://docs.mem0.ai/contributing/development',
'https://docs.mem0.ai/contributing/documentation',
'https://docs.mem0.ai/core-concepts/memory-operations',
'https://docs.mem0.ai/core-concepts/memory-types',
]
# One summarised page; main() copies these three fields into result.json.
class Link(BaseModel):
    url: str
    title: str
    summary: str


# Top-level result schema: main() validates the agent's final JSON against it.
class Links(BaseModel):
    links: list[Link]


# Passed to the agent as initial_actions: opens the docs home page in a tab.
initial_actions = [
    {'open_tab': {'url': 'https://docs.mem0.ai/'}},
]

# The controller is given the schema as its output model
# (presumably so the agent's final answer follows it - confirm in Controller).
controller = Controller(output_model=Links)
task_description = f"""
Visit all the links provided in {links} and summarize the content of the page with url and title. There are {len(links)} links to visit. Make sure to visit all the links. Return a json with the following format: [{{url: <url>, title: <title>, summary: <summary>}}].
Guidelines:
1. Strictly stay on the domain https://docs.mem0.ai
2. Do not visit any other websites.
3. Ignore the links that are hashed (#) or javascript (:), or mailto, or tel, or other protocols
4. Don't visit any other url other than the ones provided above.
5. Capture the unique urls which are not already visited.
6. If you visit any page that doesn't have host name docs.mem0.ai, then do not visit it and come back to the page with host name docs.mem0.ai.
"""
async def main(max_steps=500):
    """Summarise the listed pages with the agent and write the result to result.json.

    Args:
        max_steps: upper bound on agent steps passed to ``agent.run``.

    An empty JSON list is written when the agent returns no final result.
    """
    config = BrowserConfig(headless=True)
    browser = Browser(config=config)
    try:
        agent = Agent(
            task=task_description,
            llm=ChatOpenAI(model='gpt-4o-mini'),
            controller=controller,
            initial_actions=initial_actions,
            enable_memory=True,
            browser=browser,
        )
        history = await agent.run(max_steps=max_steps)
    finally:
        # The browser is created by this function, so it is closed here as
        # well; otherwise the headless browser stays open after the run.
        await browser.close()

    result = history.final_result()
    parsed_result = []
    if result:
        # The final result is a JSON string; validate it into the Links model.
        parsed: Links = Links.model_validate_json(result)
        print(f'Total parsed links: {len(parsed.links)}')
        parsed_result = [{'title': link.title, 'url': link.url, 'summary': link.summary} for link in parsed.links]
    else:
        print('No result')

    async with await anyio.open_file('result.json', 'w+') as f:
        await f.write(json.dumps(parsed_result, indent=4))


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,49 @@
"""
Demonstrate output validator.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import ActionResult, Agent, Controller
controller = Controller()


# Parameters of the custom "done" action registered below.
class DoneResult(BaseModel):
    title: str
    comments: str
    hours_since_start: int


# we overwrite done() in this example to demonstrate the validator
@controller.registry.action('Done with task', param_model=DoneResult)
async def done(params: DoneResult):
    # Build and print a proper result, but deliberately do not return it.
    print(ActionResult(is_done=True, extracted_content=params.model_dump_json()))
    # NOTE: this is clearly wrong - to demonstrate the validator
    return 'blablabla'
async def main():
    """Run the agent with output validation enabled for at most 5 steps."""
    agent = Agent(
        task='Go to hackernews hn and give me the top 1 post',
        llm=ChatOpenAI(model='gpt-4o'),
        controller=controller,
        validate_output=True,
    )
    # NOTE: this should fail to demonstrate the validator
    await agent.run(max_steps=5)


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,123 @@
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
from dotenv import load_dotenv
load_dotenv()
import discord
from discord.ext import commands
from langchain_core.language_models.chat_models import BaseChatModel
from browser_use import BrowserConfig
from browser_use.agent.service import Agent, Browser
class DiscordBot(commands.Bot):
    """Discord bot implementation for Browser-Use tasks.

    This bot allows users to run browser automation tasks through Discord messages.
    Processes tasks asynchronously and sends the result back to the user in response to the message.
    Messages must start with the configured prefix (default: "$bu") followed by the task description.

    Args:
        llm (BaseChatModel): Language model instance to use for task processing
        prefix (str, optional): Command prefix for triggering browser tasks. Defaults to "$bu"
        ack (bool, optional): Whether to acknowledge task receipt with a message. Defaults to False
        browser_config (BrowserConfig, optional): Browser configuration settings.
            Defaults to headless mode

    Usage:
        ```python
        from langchain_openai import ChatOpenAI

        llm = ChatOpenAI()
        bot = DiscordBot(llm=llm, prefix='$bu', ack=True)
        bot.run('YOUR_DISCORD_TOKEN')
        ```

    Discord Usage:
        Send messages starting with the prefix:
        "$bu search for python tutorials"
    """

    def __init__(
        self,
        llm: BaseChatModel,
        prefix: str = '$bu',
        ack: bool = False,
        # NOTE: this default instance is created once and shared by every bot
        # that does not pass its own config.
        browser_config: BrowserConfig = BrowserConfig(headless=True),
    ):
        self.llm = llm
        self.prefix = prefix.strip()
        self.ack = ack
        self.browser_config = browser_config

        # Define intents.
        intents = discord.Intents.default()
        intents.message_content = True  # Enable message content intent
        intents.members = True  # Enable members intent for user info

        # Initialize the bot with a command prefix and intents.
        # The '!' command prefix is unused by this class; task messages are
        # matched against self.prefix in on_message instead.
        super().__init__(command_prefix='!', intents=intents)

    async def on_ready(self):
        """Called when the bot is ready; logs the identity and syncs the command tree."""
        try:
            print(f'We have logged in as {self.user}')
            await self.tree.sync()  # Sync the command tree with discord
        except Exception as e:
            print(f'Error during bot startup: {e}')

    async def on_message(self, message):
        """Called when a message is received.

        Messages starting with "<prefix> " are treated as browser tasks: the
        remainder of the message is run through the agent and the outcome (or
        the error text) is sent back to the channel as a reply to the message.
        """
        try:
            if message.author == self.user:  # Ignore the bot's messages
                return

            content = message.content.strip()
            trigger = f'{self.prefix} '
            if not content.startswith(trigger):
                return

            # Strip only the leading trigger. Using str.replace here would also
            # delete later occurrences of the prefix inside the task text.
            task = content[len(trigger):].strip()

            if self.ack:
                try:
                    await message.reply(
                        'Starting browser use task...',
                        mention_author=True,  # NOTE(review): this pings the author; use False to avoid the ping
                    )
                except Exception as e:
                    print(f'Error sending start message: {e}')

            try:
                agent_message = await self.run_agent(task)
                await message.channel.send(content=f'{agent_message}', reference=message, mention_author=True)
            except Exception as e:
                await message.channel.send(
                    content=f'Error during task execution: {str(e)}',
                    reference=message,
                    mention_author=True,
                )
        except Exception as e:
            print(f'Error in message handling: {e}')

    async def run_agent(self, task: str) -> str:
        """Run a single browser-use task and return the text to send back.

        Args:
            task: Natural-language task description taken from the Discord message.

        Returns:
            The extracted content of the last history entry when the run reports
            it is done, otherwise a generic failure message.

        Raises:
            Exception: wraps (and chains) any error raised while running the task.
        """
        browser = None
        try:
            browser = Browser(config=self.browser_config)
            agent = Agent(task=task, llm=self.llm, browser=browser)
            result = await agent.run()

            agent_message = None
            if result.is_done():
                agent_message = result.history[-1].result[0].extracted_content

            if agent_message is None:
                agent_message = 'Oops! Something went wrong while running Browser-Use.'

            return agent_message
        except Exception as e:
            raise Exception(f'Browser-use task failed: {str(e)}') from e
        finally:
            # A new Browser is created per task, so release it on every path.
            if browser is not None:
                try:
                    await browser.close()
                except Exception as close_error:
                    print(f'Error closing browser: {close_error}')

View file

@ -0,0 +1,72 @@
"""
This example requires you to have a Discord bot token and the bot already added to a server.
Steps to create, invite, and run a Discord bot:
1. Create a Discord Application:
* Go to the Discord Developer Portal: https://discord.com/developers/applications
* Log in to the Discord website.
* Click on "New Application".
* Give the application a name and click "Create".
2. Configure the Bot:
* Navigate to the "Bot" tab on the left side of the screen.
* Make sure "Public Bot" is ticked if you want others to invite your bot.
* Generate your bot token by clicking on "Reset Token", then copy the token and save it securely.
* Do not share the bot token. Treat it like a password. If the token is leaked, regenerate it.
3. Enable Privileged Intents:
* Scroll down to the "Privileged Gateway Intents" section.
* Enable the necessary intents (e.g., "Server Members Intent" and "Message Content Intent").
--> Note: Enabling privileged intents for bots in over 100 guilds requires bot verification. You may need to contact Discord support to enable privileged intents for verified bots.
4. Generate Invite URL:
* Go to "OAuth2" tab and "OAuth2 URL Generator" section.
* Under "scopes", tick the "bot" checkbox.
* Tick the permissions required for your bot to function under Bot Permissions.
* e.g. "Send Messages", "Send Messages in Threads", "Read Message History", "Mention Everyone".
* Copy the generated URL under the "GENERATED URL" section at the bottom.
5. Invite the Bot:
* Paste the URL into your browser.
* Choose a server to invite the bot to.
* Click Authorize.
--> Note: The person adding the bot needs "Manage Server" permissions.
6. Run the code below to start the bot with your bot token.
7. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
"""
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import BrowserConfig
from examples.integrations.discord.discord_api import DiscordBot
# load credentials from environment variables
bot_token = os.getenv('DISCORD_BOT_TOKEN')
if not bot_token:
raise ValueError('Discord bot token not found in .env file.')
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
bot = DiscordBot(
llm=llm, # required; instance of BaseChatModel
prefix='$bu', # optional; prefix of messages to trigger browser-use, defaults to "$bu"
ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False
browser_config=BrowserConfig(
headless=False
), # optional; useful for changing headless mode or other browser configs, defaults to headless mode
)
bot.run(
token=bot_token, # required; Discord bot token
)

View file

@ -0,0 +1,76 @@
# Slack Integration
Steps to create and configure a Slack bot:
1. Create a Slack App:
* Go to the Slack API: https://api.slack.com/apps
* Click on "Create New App".
* Choose "From scratch" and give your app a name and select the workspace.
* Provide a name and description for your bot (these are required fields).
2. Configure the Bot:
* Navigate to the "OAuth & Permissions" tab on the left side of the screen.
* Under "Scopes", add the necessary bot token scopes (add these "chat:write", "channels:history", "im:history").
3. Enable Event Subscriptions:
* Navigate to the "Event Subscriptions" tab.
* Enable events and add the necessary bot events (add these "message.channels", "message.im").
* Add your request URL (you can use ngrok to expose your local server if needed). [See how to set up ngrok](#installing-and-starting-ngrok).
* **Note:** The URL provided by ngrok is ephemeral and will change each time ngrok is started. You will need to update the request URL in the bot's settings each time you restart ngrok. [See how to update the request URL](#updating-the-request-url-in-bots-settings).
4. Add the bot to your Slack workspace:
* Navigate to the "OAuth & Permissions" tab.
* Under "OAuth Tokens for Your Workspace", click on "Install App to Workspace".
* Follow the prompts to authorize the app and add it to your workspace.
5. Set up environment variables:
* Obtain the `SLACK_SIGNING_SECRET`:
* Go to the Slack API: https://api.slack.com/apps
* Select your app.
* Navigate to the "Basic Information" tab.
* Copy the "Signing Secret".
* Obtain the `SLACK_BOT_TOKEN`:
* Go to the Slack API: https://api.slack.com/apps
* Select your app.
* Navigate to the "OAuth & Permissions" tab.
* Copy the "Bot User OAuth Token".
* Create a `.env` file in the root directory of your project and add the following lines:
```env
SLACK_SIGNING_SECRET=your-signing-secret
SLACK_BOT_TOKEN=your-bot-token
```
6. Invite the bot to a channel:
* Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active.
7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret.
8. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
## Installing and Starting ngrok
To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok:
1. Download ngrok from the official website: https://ngrok.com/download
2. Create a free account and follow the official steps to install ngrok.
3. Start ngrok by running the following command in your terminal:
```sh
ngrok http 3000
```
Replace `3000` with the port number your local server is running on.
## Updating the Request URL in Bot's Settings
If you need to update the request URL (e.g., when the ngrok URL changes), follow these steps:
1. Go to the Slack API: https://api.slack.com/apps
2. Select your app.
3. Navigate to the "Event Subscriptions" tab.
4. Update the "Request URL" field with the new ngrok URL. The URL should be something like: `https://<ngrok-id>.ngrok-free.app/slack/events`
5. Save the changes.
## Installing Required Packages
To run this example, you need to install the following packages:
- `fastapi`
- `uvicorn`
- `slack_sdk`
You can install these packages using pip:
```sh
pip install fastapi uvicorn slack_sdk
```

View file

@ -0,0 +1,130 @@
import logging
import os
import sys
from typing import Annotated
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from fastapi import Depends, FastAPI, HTTPException, Request
from langchain_core.language_models.chat_models import BaseChatModel
from slack_sdk.errors import SlackApiError
from slack_sdk.signature import SignatureVerifier
from slack_sdk.web.async_client import AsyncWebClient
from browser_use import BrowserConfig
from browser_use.agent.service import Agent, Browser
from browser_use.logging_config import setup_logging
setup_logging()
logger = logging.getLogger('slack')
app = FastAPI()
class SlackBot:
    """Slack bot that runs browser-use tasks for messages starting with "$bu ".

    Args:
        llm: Language model instance used by the agent.
        bot_token: Slack bot token used to create the AsyncWebClient.
        signing_secret: Slack signing secret used to verify incoming requests.
        ack: Whether to post a "starting" message before running the task.
        browser_config: Browser configuration; defaults to headless mode.

    Raises:
        ValueError: if the bot token or signing secret is empty.
    """

    def __init__(
        self,
        llm: BaseChatModel,
        bot_token: str,
        signing_secret: str,
        ack: bool = False,
        browser_config: BrowserConfig = BrowserConfig(headless=True),
    ):
        if not bot_token or not signing_secret:
            raise ValueError('Bot token and signing secret must be provided')

        self.llm = llm
        self.ack = ack
        self.browser_config = browser_config
        self.client = AsyncWebClient(token=bot_token)
        self.signature_verifier = SignatureVerifier(signing_secret)
        # Event ids already handled, used to drop repeated deliveries.
        # NOTE: this set is never pruned, so it grows for the life of the process.
        self.processed_events = set()
        logger.info('SlackBot initialized')

    async def handle_event(self, event, event_id):
        """Process one Slack event, running a task when the text starts with "$bu ".

        Events without an id, already-seen ids and bot messages are ignored.
        Errors are logged rather than propagated.
        """
        try:
            logger.info(f'Received event id: {event_id}')
            if not event_id:
                logger.warning('Event ID missing in event data')
                return

            if event_id in self.processed_events:
                logger.info(f'Event {event_id} already processed')
                return
            self.processed_events.add(event_id)

            if 'subtype' in event and event['subtype'] == 'bot_message':
                return

            text = event.get('text')
            user_id = event.get('user')
            if text and text.startswith('$bu '):
                task = text[len('$bu ') :].strip()
                if self.ack:
                    try:
                        await self.send_message(
                            event['channel'], f'<@{user_id}> Starting browser use task...', thread_ts=event.get('ts')
                        )
                    except Exception as e:
                        logger.error(f'Error sending start message: {e}')

                try:
                    agent_message = await self.run_agent(task)
                    await self.send_message(event['channel'], f'<@{user_id}> {agent_message}', thread_ts=event.get('ts'))
                except Exception as e:
                    await self.send_message(event['channel'], f'Error during task execution: {str(e)}', thread_ts=event.get('ts'))
        except Exception as e:
            logger.error(f'Error in handle_event: {str(e)}')

    async def run_agent(self, task: str) -> str:
        """Run a single browser-use task and return the text to post back.

        Returns the extracted content of the last history entry when the run
        reports it is done, a generic failure message when there is none, or an
        error description if the run raised.
        """
        browser = None
        try:
            browser = Browser(config=self.browser_config)
            agent = Agent(task=task, llm=self.llm, browser=browser)
            result = await agent.run()

            agent_message = None
            if result.is_done():
                agent_message = result.history[-1].result[0].extracted_content

            if agent_message is None:
                agent_message = 'Oops! Something went wrong while running Browser-Use.'

            return agent_message
        except Exception as e:
            logger.error(f'Error during task execution: {str(e)}')
            return f'Error during task execution: {str(e)}'
        finally:
            # A new Browser is created per task, so release it on every path.
            if browser is not None:
                try:
                    await browser.close()
                except Exception as close_error:
                    logger.error(f'Error closing browser: {close_error}')

    async def send_message(self, channel, text, thread_ts=None):
        """Post `text` to `channel` (optionally in a thread); Slack API errors are logged."""
        try:
            await self.client.chat_postMessage(channel=channel, text=text, thread_ts=thread_ts)
        except SlackApiError as e:
            logger.error(f'Error sending message: {e.response["error"]}')
@app.post('/slack/events')
async def slack_events(request: Request, slack_bot: Annotated[SlackBot, Depends()]):
    """Slack Events API endpoint.

    Verifies the request signature, answers URL-verification challenges and
    forwards event payloads to the bot.

    Raises:
        HTTPException: 400 when signature verification fails, 500 for any
            other unexpected error.
    """
    try:
        if not slack_bot.signature_verifier.is_valid_request(await request.body(), dict(request.headers)):
            logger.warning('Request verification failed')
            raise HTTPException(status_code=400, detail='Request verification failed')

        event_data = await request.json()
        logger.info(f'Received event data: {event_data}')
        if 'challenge' in event_data:
            return {'challenge': event_data['challenge']}

        if 'event' in event_data:
            try:
                await slack_bot.handle_event(event_data.get('event'), event_data.get('event_id'))
            except Exception as e:
                logger.error(f'Error handling event: {str(e)}')

        return {}
    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) through unchanged; the
        # generic handler below would otherwise turn them into a 500.
        raise
    except Exception as e:
        logger.error(f'Error in slack_events: {str(e)}')
        raise HTTPException(status_code=500, detail='Internal Server Error') from e

View file

@ -0,0 +1,46 @@
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import BrowserConfig
from examples.integrations.slack.slack_api import SlackBot, app
# load credentials from environment variables
bot_token = os.getenv('SLACK_BOT_TOKEN')
if not bot_token:
raise ValueError('Slack bot token not found in .env file.')
signing_secret = os.getenv('SLACK_SIGNING_SECRET')
if not signing_secret:
raise ValueError('Slack signing secret not found in .env file.')
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
slack_bot = SlackBot(
llm=llm, # required; instance of BaseChatModel
bot_token=bot_token, # required; Slack bot token
signing_secret=signing_secret, # required; Slack signing secret
ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False
browser_config=BrowserConfig(
headless=True
), # optional; useful for changing headless mode or other browser configs, defaults to headless mode
)
app.dependency_overrides[SlackBot] = lambda: slack_bot
if __name__ == '__main__':
    import uvicorn

    # Serve the very `app` object whose dependency_overrides were set above.
    # An import string such as 'integrations.slack.slack_api:app' makes uvicorn
    # import the module under a different name than the one imported here,
    # yielding a second FastAPI instance without the SlackBot override.
    uvicorn.run(app, host='0.0.0.0', port=3000)

View file

@ -0,0 +1,2 @@
# Gemini
Detailed video on how to integrate browser-use with Gemini: https://www.youtube.com/watch?v=JluZiWBV_Tc

View file

@ -0,0 +1,42 @@
# Optional: Disable telemetry
# os.environ["ANONYMIZED_TELEMETRY"] = "false"
# Optional: Set the OLLAMA host to a remote server
# os.environ["OLLAMA_HOST"] = "http://x.x.x.x:11434"
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_ollama import ChatOllama
from browser_use import Agent
from browser_use.agent.views import AgentHistoryList
async def run_search() -> AgentHistoryList:
    """Run the subreddit search task with a local Ollama model and return the history."""
    local_llm = ChatOllama(
        model='qwen2.5:32b-instruct-q4_K_M',
        num_ctx=32000,
    )
    search_agent = Agent(
        task="Search for a 'browser use' post on the r/LocalLLaMA subreddit and open it.",
        llm=local_llm,
    )
    return await search_agent.run()
async def main():
    """Run the search and print the resulting history after two blank lines."""
    history = await run_search()
    print('\n\n', history)
if __name__ == '__main__':
asyncio.run(main())

View file

@ -0,0 +1,49 @@
"""
Simple try of the agent.
@dev You need to add AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import AzureChatOpenAI
from browser_use import Agent
# Retrieve Azure-specific environment variables
azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY')
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
if not azure_openai_api_key or not azure_openai_endpoint:
raise ValueError('AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is not set')
# Initialize the Azure OpenAI client
llm = AzureChatOpenAI(
model_name='gpt-4o',
openai_api_key=azure_openai_api_key,
azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base
deployment_name='gpt-4o', # Use deployment_name for Azure models
api_version='2024-08-01-preview', # Explicitly set the API version here
)
agent = Agent(
task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
llm=llm,
enable_memory=True,
)
async def main():
    """Run the module-level agent for at most 10 steps, then wait for Enter."""
    step_limit = 10
    await agent.run(max_steps=step_limit)
    input('Press Enter to continue...')
asyncio.run(main())

View file

@ -0,0 +1,75 @@
"""
Automated news analysis and sentiment scoring using Bedrock.
Ensure you have browser-use installed with the `examples` extra, e.g. `uv pip install 'browser-use[examples]'`
@dev Ensure AWS environment variables are set correctly for Bedrock access.
"""
import argparse
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import boto3
from botocore.config import Config
from langchain_aws import ChatBedrockConverse
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
def get_llm():
    """Build the Bedrock chat model used by this example.

    The bedrock-runtime client is created in us-east-1 with adaptive retries
    (up to 10 attempts) and handed to ChatBedrockConverse.
    """
    retry_config = Config(retries={'max_attempts': 10, 'mode': 'adaptive'})
    runtime_client = boto3.client('bedrock-runtime', region_name='us-east-1', config=retry_config)
    chat_model = ChatBedrockConverse(
        model_id='us.anthropic.claude-3-5-sonnet-20241022-v2:0',
        temperature=0.0,
        max_tokens=None,
        client=runtime_client,
    )
    return chat_model
# Define the task for the agent
task = (
"Visit cnn.com, navigate to the 'World News' section, and identify the latest headline. "
'Open the first article and summarize its content in 3-4 sentences. '
'Additionally, analyze the sentiment of the article (positive, neutral, or negative) '
'and provide a confidence score for the sentiment. Present the result in a tabular format.'
)
parser = argparse.ArgumentParser()
parser.add_argument('--query', type=str, help='The query for the agent to execute', default=task)
args = parser.parse_args()
llm = get_llm()
browser = Browser(
config=BrowserConfig(
# browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
)
)
agent = Agent(
task=args.query,
llm=llm,
controller=Controller(),
browser=browser,
validate_output=True,
)
async def main():
    """Run the module-level agent for at most 30 steps and always close the browser."""
    try:
        await agent.run(max_steps=30)
    finally:
        # Close the browser even when the run raises, so it is not left open.
        await browser.close()
asyncio.run(main())

View file

@ -0,0 +1,32 @@
"""
Simple script that runs the task of opening amazon and searching.
@dev Ensure we have a `ANTHROPIC_API_KEY` variable in our `.env` file.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_anthropic import ChatAnthropic
from browser_use import Agent
llm = ChatAnthropic(model_name='claude-3-7-sonnet-20250219', temperature=0.0, timeout=30, stop=None)
agent = Agent(
task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
llm=llm,
)
async def main():
    """Run the module-level agent, capped at 10 steps."""
    step_limit = 10
    await agent.run(max_steps=step_limit)
asyncio.run(main())

View file

@ -0,0 +1,38 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_deepseek import ChatDeepSeek
from pydantic import SecretStr
from browser_use import Agent
api_key = os.getenv('DEEPSEEK_API_KEY', '')
if not api_key:
raise ValueError('DEEPSEEK_API_KEY is not set')
async def run_search():
    """Run the Amazon laptop task with deepseek-reasoner (vision disabled)."""
    reasoner = ChatDeepSeek(
        base_url='https://api.deepseek.com/v1',
        model='deepseek-reasoner',
        api_key=SecretStr(api_key),
    )
    search_agent = Agent(
        task='go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
        llm=reasoner,
        use_vision=False,
        max_failures=2,
        max_actions_per_step=1,
    )
    await search_agent.run()
if __name__ == '__main__':
asyncio.run(run_search())

View file

@ -0,0 +1,41 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_deepseek import ChatDeepSeek
from pydantic import SecretStr
from browser_use import Agent
api_key = os.getenv('DEEPSEEK_API_KEY', '')
if not api_key:
raise ValueError('DEEPSEEK_API_KEY is not set')
async def run_search():
    """Run the r/LocalLLaMA search task with deepseek-chat (vision disabled)."""
    # Steps are joined with newlines: adjacent string literals concatenate with
    # no separator, which previously fused them ("...search bar3. Click...").
    task = '\n'.join(
        [
            '1. Go to https://www.reddit.com/r/LocalLLaMA',
            "2. Search for 'browser use' in the search bar",
            '3. Click on first result',
            '4. Return the first comment',
        ]
    )
    agent = Agent(
        task=task,
        llm=ChatDeepSeek(
            base_url='https://api.deepseek.com/v1',
            model='deepseek-chat',
            api_key=SecretStr(api_key),
        ),
        use_vision=False,
    )
    await agent.run()
if __name__ == '__main__':
asyncio.run(run_search())

View file

@ -0,0 +1,45 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent, BrowserConfig
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContextConfig
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
browser = Browser(
config=BrowserConfig(
new_context_config=BrowserContextConfig(
viewport_expansion=0,
)
)
)
async def run_search():
    """Run the Amazon laptop task on the module-level Gemini model and browser."""
    amazon_task = 'Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result'
    search_agent = Agent(
        task=amazon_task,
        llm=llm,
        max_actions_per_step=4,
        browser=browser,
    )
    await search_agent.run(max_steps=25)
if __name__ == '__main__':
asyncio.run(run_search())

View file

@ -0,0 +1,33 @@
"""
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
llm = ChatOpenAI(model='gpt-4o')
agent = Agent(
task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
llm=llm,
)
async def main():
    """Run the module-level agent for at most 10 steps, then wait for Enter."""
    step_limit = 10
    await agent.run(max_steps=step_limit)
    input('Press Enter to continue...')
asyncio.run(main())

View file

@ -0,0 +1,41 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import SecretStr
from browser_use import Agent
api_key = os.getenv('GROK_API_KEY', '')
if not api_key:
raise ValueError('GROK_API_KEY is not set')
async def run_search():
    """Run the Amazon headphones task against the x.ai endpoint (vision disabled)."""
    # Steps are joined with newlines: adjacent string literals concatenate with
    # no separator, which previously produced "https://www.amazon.com2. Search..."
    # and corrupted the URL.
    task = '\n'.join(
        [
            '1. Go to https://www.amazon.com',
            '2. Search for "wireless headphones"',
            '3. Filter by "Highest customer rating"',
            '4. Return the title and price of the first product',
        ]
    )
    agent = Agent(
        task=task,
        llm=ChatOpenAI(
            base_url='https://api.x.ai/v1',
            model='grok-3-beta',
            api_key=SecretStr(api_key),
        ),
        use_vision=False,
    )
    await agent.run()
if __name__ == '__main__':
asyncio.run(run_search())

View file

@ -0,0 +1,47 @@
"""
Simple try of the agent.
@dev You need to add NOVITA_API_KEY to your environment variables.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import SecretStr
from browser_use import Agent
api_key = os.getenv('NOVITA_API_KEY', '')
if not api_key:
raise ValueError('NOVITA_API_KEY is not set')
async def run_search():
    """Run the r/LocalLLaMA search task against the Novita endpoint (vision disabled)."""
    # Steps are joined with newlines: adjacent string literals concatenate with
    # no separator, which previously fused them ("...search bar3. Click...").
    task = '\n'.join(
        [
            '1. Go to https://www.reddit.com/r/LocalLLaMA',
            "2. Search for 'browser use' in the search bar",
            '3. Click on first result',
            '4. Return the first comment',
        ]
    )
    agent = Agent(
        task=task,
        llm=ChatOpenAI(
            base_url='https://api.novita.ai/v3/openai',
            model='deepseek/deepseek-v3-0324',
            api_key=SecretStr(api_key),
        ),
        use_vision=False,
    )
    await agent.run()
if __name__ == '__main__':
asyncio.run(run_search())

View file

@ -0,0 +1,34 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_ollama import ChatOllama
from browser_use import Agent
async def run_search():
    """Run the r/LocalLLaMA search task with a local qwen2.5 model via Ollama."""
    agent = Agent(
        # One step per line; the steps were previously fused into a single run
        # of text, gluing "2." onto the end of the subreddit URL.
        task=(
            '1. Go to https://www.reddit.com/r/LocalLLaMA\n'
            "2. Search for 'browser use' in the search bar\n"
            '3. Click search\n'
            '4. Call done'
        ),
        llm=ChatOllama(
            # model='qwen2.5:32b-instruct-q4_K_M',
            # model='qwen2.5:14b',
            model='qwen2.5:latest',
            num_ctx=128000,
        ),
        max_actions_per_step=1,
    )
    await agent.run()
if __name__ == '__main__':
asyncio.run(run_search())

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,30 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Initialize the model
llm = ChatOpenAI(
model='gpt-4o',
temperature=0.0,
)
task = 'Go to kayak.com and find the cheapest flight from Zurich to San Francisco on 2025-05-01'
agent = Agent(task=task, llm=llm)
async def main():
    """Run the module-level flight-search agent to completion."""
    run = agent.run()
    await run
if __name__ == '__main__':
asyncio.run(main())

View file

@ -0,0 +1,7 @@
# **User Interfaces of Browser-Use**
| **File Name** | **User Interface** | **Description** | **Example Usage** |
|------------------------|-------------------|-------------------------------------------|-------------------------------------------|
| `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` |
| `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` |
| `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` |

View file

@ -0,0 +1,98 @@
"""
To Use It:
Example 1: Using OpenAI (default), with default task: 'go to reddit and search for posts about browser-use'
python command_line.py
Example 2: Using OpenAI with a Custom Query
python command_line.py --query "go to google and search for browser-use"
Example 3: Using Anthropic's Claude Model with a Custom Query
python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic
"""
import argparse
import asyncio
import os
import sys
# Ensure local repository (browser_use) is accessible
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
def get_llm(provider: str):
    """Return a chat model for `provider` ('anthropic' or 'openai').

    Raises:
        ValueError: if the provider is unsupported or its API key environment
            variable is not set.
    """
    if provider not in ('anthropic', 'openai'):
        raise ValueError(f'Unsupported provider: {provider}')

    if provider == 'openai':
        from langchain_openai import ChatOpenAI

        if not os.getenv('OPENAI_API_KEY'):
            raise ValueError('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')
        return ChatOpenAI(model='gpt-4o', temperature=0.0)

    from langchain_anthropic import ChatAnthropic

    if not os.getenv('ANTHROPIC_API_KEY'):
        raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')
    return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)
def parse_arguments():
    """Parse command-line arguments."""
    cli = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.')
    query_options = {
        'type': str,
        'help': 'The query to process',
        'default': 'go to reddit and search for posts about browser-use',
    }
    provider_options = {
        'type': str,
        'choices': ['openai', 'anthropic'],
        'default': 'openai',
        'help': 'The model provider to use (default: openai)',
    }
    cli.add_argument('--query', **query_options)
    cli.add_argument('--provider', **provider_options)
    return cli.parse_args()
def initialize_agent(query: str, provider: str):
    """Initialize the browser agent with the given query and provider.

    Returns an (agent, browser) pair so the caller can close the browser.
    """
    chat_model = get_llm(provider)
    action_controller = Controller()
    browser = Browser(config=BrowserConfig())
    agent = Agent(
        task=query,
        llm=chat_model,
        controller=action_controller,
        browser=browser,
        use_vision=True,
        max_actions_per_step=1,
    )
    return agent, browser
async def main():
    """Main async function to run the agent.

    Parses the CLI arguments, runs the agent for at most 25 steps, waits for
    Enter and then closes the browser. The browser is closed even when the run
    fails.
    """
    args = parse_arguments()
    agent, browser = initialize_agent(args.query, args.provider)

    try:
        await agent.run(max_steps=25)
        input('Press Enter to close the browser...')
    finally:
        # Close the browser on every path, including when agent.run raises.
        await browser.close()
if __name__ == '__main__':
asyncio.run(main())

View file

@ -0,0 +1,109 @@
import asyncio
import os
import sys
from dataclasses import dataclass
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
# Third-party imports
import gradio as gr
from langchain_openai import ChatOpenAI
from rich.console import Console
from rich.panel import Panel
from rich.text import Text
# Local module imports
from browser_use import Agent
@dataclass
class ActionResult:
is_done: bool
extracted_content: str | None
error: str | None
include_in_memory: bool
@dataclass
class AgentHistoryList:
all_results: list[ActionResult]
all_model_outputs: list[dict]
def parse_agent_history(history_str: str) -> None:
    """Print one panel per ActionResult in `history_str` that has extracted content.

    The string is split on 'ActionResult(' and, for each piece, the text after
    'extracted_content=' up to the first comma is shown. Pieces without content
    are skipped.
    """
    console = Console()

    # Everything before the first 'ActionResult(' is not a result entry.
    entries = history_str.split('ActionResult(')[1:]

    for step_number, entry in enumerate(entries, 1):
        if 'extracted_content=' not in entry:
            continue

        after_key = entry.split('extracted_content=')[1]
        content = after_key.split(',')[0].strip("'")
        if not content:
            continue

        title = Text(f'Step {step_number}', style='bold blue')
        console.print(Panel(content, title=title, border_style='blue'))
        console.print()
async def run_browser_task(
    task: str,
    api_key: str,
    model: str = 'gpt-4o',
    headless: bool = True,
) -> str:
    """Run one browser-use task and return its result as text.

    Args:
        task: Natural-language task description.
        api_key: OpenAI API key; exported as OPENAI_API_KEY for the model client.
        model: OpenAI model name passed to ChatOpenAI.
        headless: Whether the browser is started in headless mode.

    Returns:
        The string form of the agent's run result, a prompt to supply an API
        key when none was given, or 'Error: ...' if the run raised.
    """
    if not api_key.strip():
        return 'Please provide an API key'

    os.environ['OPENAI_API_KEY'] = api_key

    browser = None
    try:
        # Imported here because the module top only imports Agent from browser_use.
        from browser_use import Browser, BrowserConfig

        # Honour the UI selections: previously the model was hard-coded to
        # 'gpt-4o' and the headless flag was ignored.
        browser = Browser(config=BrowserConfig(headless=headless))
        agent = Agent(
            task=task,
            llm=ChatOpenAI(model=model),
            browser=browser,
        )
        result = await agent.run()
        # TODO: The result could be parsed better
        # The output textbox and the annotation expect a str, not the history object.
        return str(result)
    except Exception as e:
        return f'Error: {str(e)}'
    finally:
        # This function owns the browser it created, so release it on every path.
        if browser is not None:
            try:
                await browser.close()
            except Exception:
                # Best-effort cleanup: a close failure must not replace the task result.
                pass
def create_ui():
    """Build the Gradio Blocks interface for running a browser task.

    The left column collects the API key, task, model and headless flag; the
    right column shows the output. Clicking the button runs run_browser_task
    synchronously via asyncio.run.
    """

    def _run_task_sync(*args):
        # Bridge from Gradio's sync callback to the async task runner.
        return asyncio.run(run_browser_task(*args))

    with gr.Blocks(title='Browser Use GUI') as interface:
        gr.Markdown('# Browser Use Task Automation')

        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(label='OpenAI API Key', placeholder='sk-...', type='password')
                task = gr.Textbox(
                    label='Task Description',
                    placeholder='E.g., Find flights from New York to London for next week',
                    lines=3,
                )
                model = gr.Dropdown(choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4')
                headless = gr.Checkbox(label='Run Headless', value=True)
                submit_btn = gr.Button('Run Task')

            with gr.Column():
                output = gr.Textbox(label='Output', lines=10, interactive=False)

        form_inputs = [task, api_key, model, headless]
        submit_btn.click(fn=_run_task_sync, inputs=form_inputs, outputs=output)

    return interface
if __name__ == '__main__':
demo = create_ui()
demo.launch()

View file

@ -0,0 +1,86 @@
"""
To use it, you'll need to install streamlit, and run with:
python -m streamlit run streamlit_demo.py
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import streamlit as st
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
# On Windows, select asyncio's Proactor event loop policy.
# NOTE(review): presumably required for launching the browser subprocess — confirm.
if os.name == 'nt':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
# Build the chat model for the selected provider
def get_llm(provider: str):
    """Return a chat model for ``provider`` (``'anthropic'`` or ``'openai'``).

    The provider's client library is imported lazily. If the matching API key
    environment variable is missing, or the provider is unknown, an error is
    shown in the page and ``st.stop()`` is called.
    """

    def require_env(var_name: str) -> None:
        # Report the missing key in the page, then stop
        if not os.getenv(var_name):
            st.error(f'Error: {var_name} is not set. Please provide a valid API key.')
            st.stop()

    if provider == 'anthropic':
        from langchain_anthropic import ChatAnthropic

        require_env('ANTHROPIC_API_KEY')
        return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)

    if provider == 'openai':
        from langchain_openai import ChatOpenAI

        require_env('OPENAI_API_KEY')
        return ChatOpenAI(model='gpt-4o', temperature=0.0)

    st.error(f'Unsupported provider: {provider}')
    st.stop()
# Wire up an agent and the browser it drives for one query
def initialize_agent(query: str, provider: str):
    """Create the agent for ``query`` and return it together with its browser.

    Returns:
        ``(agent, browser)``; the browser is returned alongside the agent so
        the caller can close it.
    """
    llm = get_llm(provider)
    controller = Controller()
    browser = Browser(config=BrowserConfig())

    agent = Agent(
        task=query,
        llm=llm,
        controller=controller,
        browser=browser,
        use_vision=True,
        max_actions_per_step=1,
    )
    return agent, browser
# Streamlit UI
st.title('Automated Browser Agent with LLMs 🤖')

query = st.text_input('Enter your query:', 'go to reddit and search for posts about browser-use')
provider = st.radio('Select LLM Provider:', ['openai', 'anthropic'], index=0)

if st.button('Run Agent'):
    st.write('Initializing agent...')
    agent, browser = initialize_agent(query, provider)

    async def run_agent():
        # The spinner is shown only while the agent is running
        with st.spinner('Running automation...'):
            await agent.run(max_steps=25)
        st.success('Task completed! 🎉')

    def close_browser():
        # Button callback: close the browser created for this run
        return asyncio.run(browser.close())

    asyncio.run(run_agent())

    st.button('Close Browser', on_click=close_browser)

View file

@ -0,0 +1,12 @@
# Use Cases of Browser-Use
| File Name | Description |
|-----------|------------|
| `captcha.py` | Automates CAPTCHA solving on a demo website. |
| `check_appointment.py` | Checks for available visa appointment slots on the Greece MFA website. |
| `find_and_apply_to_jobs.py` | Searches for job listings, evaluates relevance based on a CV, and applies automatically. |
| `online_coding_agent.py` | Implements a multi-agent system for online code editors, with separate agents for coding and execution. |
| `post-twitter.py` | Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies. |
| `scrolling_page.py` | Automates webpage scrolling with various scrolling actions and text search functionality. |
| `twitter_post_using_cookies.py` | Automates posting on X (Twitter) using stored authentication cookies. |
| `web_voyager_agent.py` | A general-purpose web navigation agent for tasks like flight booking and course searching. |

View file

@ -0,0 +1,40 @@
"""
Goal: Automates CAPTCHA solving on a demo website.
Simple try of the agent.
@dev You need to add OPENAI_API_KEY to your environment variables.
NOTE: captchas are hard. This example works, but captchas inside iframes, for instance, do not.
For this example it helps to zoom in.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
async def main():
    """Run a single agent against the captcha demo page, then wait for Enter."""
    captcha_agent = Agent(
        task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha',
        llm=ChatOpenAI(model='gpt-4o'),
    )
    await captcha_agent.run()
    # Block until the user confirms
    input('Press Enter to exit')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,52 @@
# Goal: Checks for available visa appointment slots on the Greece MFA website.
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, SecretStr
from browser_use.agent.service import Agent
from browser_use.controller.service import Controller
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')

# Controller that the custom action below is registered on
controller = Controller()
class WebpageInfo(BaseModel):
    """Model for webpage link."""

    # Defaults to a reservation page on appointment.mfa.gr
    link: str = 'https://appointment.mfa.gr/en/reservations/aero/ireland-grcon-dub/'
@controller.action('Go to the webpage', param_model=WebpageInfo)
def go_to_webpage(webpage_info: WebpageInfo):
    """Returns the webpage link."""
    # This function performs no navigation itself; it only hands the link back as its result.
    return webpage_info.link
async def main():
    """Main function to execute the agent task."""
    # Adjacent string literals are joined verbatim, so every sentence but the
    # last carries its own trailing space to keep the prompt readable.
    task = (
        'Go to the Greece MFA webpage via the link I provided you. '
        'Check the visa appointment dates. If there is no available date in this month, check the next month. '
        'If there is no available date in both months, tell me there is no available date.'
    )

    model = ChatOpenAI(model='gpt-4o-mini', api_key=SecretStr(os.getenv('OPENAI_API_KEY', '')))
    agent = Agent(task, model, controller=controller, use_vision=True)

    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,160 @@
"""
Goal: Searches for job listings, evaluates relevance based on a CV, and applies
@dev You need to add OPENAI_API_KEY to your environment variables.
Also you have to install PyPDF2 to read pdf files: pip install PyPDF2
"""
import asyncio
import csv
import logging
import os
import sys
from pathlib import Path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel, SecretStr
from PyPDF2 import PdfReader
from browser_use import ActionResult, Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Both Azure settings are read when the model is built in main(); fail on the first one that is missing.
required_env_vars = ['AZURE_OPENAI_KEY', 'AZURE_OPENAI_ENDPOINT']
for var in required_env_vars:
    if not os.getenv(var):
        raise ValueError(f'{var} is not set. Please add it to your environment variables.')
logger = logging.getLogger(__name__)
# Controller that the custom actions below are registered on
controller = Controller()

# NOTE: This is the path to your cv file
# It is resolved against the current working directory; the script refuses to start without it.
CV = Path.cwd() / 'cv_04_24.pdf'

if not CV.exists():
    raise FileNotFoundError(f'You need to set the path to your cv file in the CV variable. CV file not found at {CV}')
# Parameter model of the save_jobs action: one job posting plus its fit score.
class Job(BaseModel):
    title: str
    link: str
    company: str
    fit_score: float
    # Optional details; None when not provided
    location: str | None = None
    salary: str | None = None
@controller.action('Save jobs to file - with a score how well it fits to my profile', param_model=Job)
def save_jobs(job: Job):
    """Append one job as a row to ``jobs.csv`` in the current working directory.

    Columns written: title, company, link, salary, location, fit_score.
    The fit score is appended as the last column so the positions of the
    previously written columns do not change.

    Returns:
        A short confirmation string.
    """
    with open('jobs.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        # fit_score was previously dropped even though the action promises a score
        writer.writerow([job.title, job.company, job.link, job.salary, job.location, job.fit_score])

    return 'Saved job to file'
@controller.action('Read jobs from file')
def read_jobs():
    """Return the full text of ``jobs.csv`` (path relative to the working directory)."""
    with open('jobs.csv') as jobs_file:
        contents = jobs_file.read()
    return contents
@controller.action('Read my cv for context to fill forms')
def read_cv():
    """Extract the text of every page of the CV PDF and return it as an action result.

    Returns:
        ActionResult with the concatenated page text, flagged ``include_in_memory=True``.
    """
    reader = PdfReader(CV)
    # A page whose extract_text() gives a falsy result contributes an empty string
    text = ''.join(page.extract_text() or '' for page in reader.pages)
    logger.info(f'Read cv with {len(text)} characters')
    return ActionResult(extracted_content=text, include_in_memory=True)
@controller.action(
    'Upload cv to element - call this function to upload if element is not found, try with different index of the same upload element',
)
async def upload_cv(index: int, browser: BrowserContext):
    """Upload the CV file through the file input associated with DOM element ``index``.

    Returns:
        ActionResult with a success message in ``extracted_content``, or with
        ``error`` set when no element, no file-upload element, or no locatable
        element exists at that index, or when setting the input files fails.
    """
    cv_path = str(CV.absolute())

    element = await browser.get_dom_element_by_index(index)
    if element is None:
        return ActionResult(error=f'No element found at index {index}')

    # Same message for both "no upload element" outcomes below
    missing_upload = f'No file upload element found at index {index}'

    upload_node = element.get_file_upload_element()
    if upload_node is None:
        logger.info(missing_upload)
        return ActionResult(error=missing_upload)

    upload_handle = await browser.get_locate_element(upload_node)
    if upload_handle is None:
        logger.info(missing_upload)
        return ActionResult(error=missing_upload)

    try:
        await upload_handle.set_input_files(cv_path)
        success = f'Successfully uploaded file "{cv_path}" to index {index}'
        logger.info(success)
        return ActionResult(extracted_content=success)
    except Exception as exc:
        # The underlying error is only logged at debug level; the caller gets a generic failure
        logger.debug(f'Error in set_input_files: {str(exc)}')
        return ActionResult(error=f'Failed to upload file to index {index}')
# Browser shared by every agent created in main(); points at Chrome's default macOS install path.
# NOTE(review): disable_security=True presumably relaxes browser security checks — confirm before reuse.
browser = Browser(
    config=BrowserConfig(
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        disable_security=True,
    )
)
async def main():
    """Create one job-hunting agent per company and run them concurrently on the shared browser."""
    # ground_task = (
    # 	'You are a professional job finder. '
    # 	'1. Read my cv with read_cv'
    # 	'2. Read the saved jobs file '
    # 	'3. start applying to the first link of Amazon '
    # 	'You can navigate through pages e.g. by scrolling '
    # 	'Make sure to be on the english version of the page'
    # )
    # Adjacent string literals are joined verbatim; without explicit separators
    # the instructions ran together ("...read_cvfind ml internships...").
    ground_task = (
        'You are a professional job finder. '
        '1. Read my cv with read_cv\n'
        'find ml internships in and save them to a file\n'
        'search at company:'
    )
    tasks = [
        ground_task + '\n' + 'Google',
        # ground_task + '\n' + 'Amazon',
        # ground_task + '\n' + 'Apple',
        # ground_task + '\n' + 'Microsoft',
        # ground_task
        # + '\n'
        # + 'go to https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite/job/Taiwan%2C-Remote/Fulfillment-Analyst---New-College-Graduate-2025_JR1988949/apply/autofillWithResume?workerSubType=0c40f6bd1d8f10adf6dae42e46d44a17&workerSubType=ab40a98049581037a3ada55b087049b7 NVIDIA',
        # ground_task + '\n' + 'Meta',
    ]
    model = AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
        api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
    )

    # One agent per task, all sharing the model, controller and browser
    agents = [Agent(task=task, llm=model, controller=controller, browser=browser) for task in tasks]

    await asyncio.gather(*[agent.run() for agent in agents])


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,90 @@
"""
Show how to use custom outputs.
@dev You need to add OPENAI_API_KEY to your environment variables.
"""
import asyncio
import json
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import httpx
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
# One social media profile: platform name plus the profile's URL.
class Profile(BaseModel):
    platform: str
    profile_url: str
# Container model for a list of Profile entries; passed to the Controller as output_model
# and used in main() to validate the agent's final result.
class Profiles(BaseModel):
    profiles: list[Profile]
# 'search_google' is excluded here; a custom web-search action is registered on this controller below.
controller = Controller(exclude_actions=['search_google'], output_model=Profiles)
# Token sent as the Authorization bearer by the search action
BEARER_TOKEN = os.getenv('BEARER_TOKEN')

if not BEARER_TOKEN:
    # use the api key for ask tessa
    # you can also use other apis like exa, xAI, perplexity, etc.
    raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key')
@controller.registry.action('Search the web for a specific query')
async def search_web(query: str):
    """Query the search API and return the relevant sources as pretty-printed JSON.

    Only sources with ``score >= 0.2`` are kept, and each is reduced to the
    keys url/title/content/author/score (whichever are present).

    Args:
        query: Free-text search query.

    Returns:
        ActionResult whose ``extracted_content`` is the JSON text (kept in memory).
    """
    keys_to_use = ['url', 'title', 'content', 'author', 'score']
    headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
    async with httpx.AsyncClient() as client:
        response = await client.post(
            'https://asktessa.ai/api/search',
            headers=headers,
            json={'query': query},
        )

    # Response.json() is a regular method returning the parsed body; the former
    # `await response.json()[...]` tried to await a list and raised TypeError.
    payload = response.json()
    final_results = [
        {key: source[key] for key in keys_to_use if key in source}
        for source in payload['sources']
        if source['score'] >= 0.2
    ]
    result_text = json.dumps(final_results, indent=4)
    print(result_text)
    return ActionResult(extracted_content=result_text, include_in_memory=True)
async def main():
    """Run the profile-finding agent and print each profile from its final result."""
    task = (
        'Go to this tiktok video url, open it and extract the @username from the resulting url. Then do a websearch for this username to find all his social media profiles. Return me the links to the social media profiles with the platform name.'
        ' https://www.tiktokv.com/share/video/7470981717659110678/ '
    )
    agent = Agent(task=task, llm=ChatOpenAI(model='gpt-4o'), controller=controller)

    history = await agent.run()
    final = history.final_result()

    if not final:
        print('No result')
        return

    # The final result is validated as JSON against the Profiles model
    parsed: Profiles = Profiles.model_validate_json(final)
    for entry in parsed.profiles:
        print('\n--------------------------------')
        print(f'Platform: {entry.platform}')
        print(f'Profile URL: {entry.profile_url}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,193 @@
import os
import sys
from browser_use.browser.context import BrowserContext
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import asyncio
import pyperclip
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from browser_use import ActionResult, Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
# Browser pointed at Chrome's default macOS install path; main() opens its context from it.
browser = Browser(
    config=BrowserConfig(
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    ),
)

# Load environment variables
load_dotenv()
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')

# Controller whose registry the Google Sheets actions below are added to
controller = Controller()
def is_google_sheet(page) -> bool:
    """Tell whether ``page.url`` starts with the Google Sheets document prefix."""
    sheets_prefix = 'https://docs.google.com/spreadsheets/'
    return page.url.startswith(sheets_prefix)
@controller.registry.action('Google Sheets: Open a specific Google Sheet')
async def open_google_sheet(browser: BrowserContext, google_sheet_url: str):
    """Navigate the current page to ``google_sheet_url`` unless it is already there.

    Returns:
        ActionResult with a confirmation, or with ``error`` set when the page
        URL is not a Google Sheets URL afterwards.
    """
    page = await browser.get_current_page()

    already_open = page.url == google_sheet_url
    if not already_open:
        await page.goto(google_sheet_url)
        await page.wait_for_load_state()

    if is_google_sheet(page):
        return ActionResult(extracted_content=f'Opened Google Sheet {google_sheet_url}', include_in_memory=False)

    return ActionResult(error='Failed to open Google Sheet, are you sure you have permissions to access this sheet?')
@controller.registry.action('Google Sheets: Get the contents of the entire sheet', page_filter=is_google_sheet)
async def get_sheet_contents(browser: BrowserContext):
    """Select all cells, copy them, and return the clipboard text.

    Returns:
        ActionResult whose ``extracted_content`` is whatever the system
        clipboard holds after the copy (flagged ``include_in_memory=True``).
    """
    page = await browser.get_current_page()

    # select all cells
    await page.keyboard.press('Enter')
    await page.keyboard.press('Escape')
    await page.keyboard.press('ControlOrMeta+A')
    await page.keyboard.press('ControlOrMeta+C')
    # Give the copy a moment to reach the clipboard before reading it,
    # matching the pause get_range_contents takes after its copy.
    await asyncio.sleep(0.1)

    extracted_tsv = pyperclip.paste()
    return ActionResult(extracted_content=extracted_tsv, include_in_memory=True)
@controller.registry.action('Google Sheets: Select a specific cell or range of cells', page_filter=is_google_sheet)
async def select_cell_or_range(browser: BrowserContext, cell_or_range: str):
    """Select ``cell_or_range`` by typing it into the popup opened with Control+G.

    The key presses and short sleeps are order-sensitive; each step's purpose
    is noted inline. Always returns a confirmation ActionResult.
    """
    page = await browser.get_current_page()

    await page.keyboard.press('Enter')  # make sure we dont delete current cell contents if we were last editing
    await page.keyboard.press('Escape')  # to clear current focus (otherwise select range popup is additive)
    await asyncio.sleep(0.1)
    await page.keyboard.press('Home')  # move cursor to the top left of the sheet first
    await page.keyboard.press('ArrowUp')
    await asyncio.sleep(0.1)
    await page.keyboard.press('Control+G')  # open the goto range popup
    await asyncio.sleep(0.2)
    await page.keyboard.type(cell_or_range, delay=0.05)
    await asyncio.sleep(0.2)
    await page.keyboard.press('Enter')
    await asyncio.sleep(0.2)
    await page.keyboard.press('Escape')  # to make sure the popup still closes in the case where the jump failed
    return ActionResult(extracted_content=f'Selected cell {cell_or_range}', include_in_memory=False)
@controller.registry.action('Google Sheets: Get the contents of a specific cell or range of cells', page_filter=is_google_sheet)
async def get_range_contents(browser: BrowserContext, cell_or_range: str):
    """Select ``cell_or_range``, copy it, and return the clipboard text.

    Returns:
        ActionResult with the clipboard text, flagged ``include_in_memory=True``.
    """
    sheet_page = await browser.get_current_page()

    await select_cell_or_range(browser, cell_or_range)

    await sheet_page.keyboard.press('ControlOrMeta+C')
    # Short pause before reading the clipboard
    await asyncio.sleep(0.1)

    return ActionResult(extracted_content=pyperclip.paste(), include_in_memory=True)
@controller.registry.action('Google Sheets: Clear the currently selected cells', page_filter=is_google_sheet)
async def clear_selected_range(browser: BrowserContext):
    """Press Backspace on the current page to clear whatever is selected."""
    sheet_page = await browser.get_current_page()
    await sheet_page.keyboard.press('Backspace')

    return ActionResult(extracted_content='Cleared selected range', include_in_memory=False)
@controller.registry.action('Google Sheets: Input text into the currently selected cell', page_filter=is_google_sheet)
async def input_selected_cell_text(browser: BrowserContext, text: str):
    """Type ``text`` on the current page, then press Enter followed by ArrowUp."""
    sheet_page = await browser.get_current_page()
    keyboard = sheet_page.keyboard

    await keyboard.type(text, delay=0.1)
    # Enter commits the input so it doesn't get overwritten by the next action;
    # ArrowUp is pressed right after it.
    for key in ('Enter', 'ArrowUp'):
        await keyboard.press(key)

    return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False)
@controller.registry.action('Google Sheets: Batch update a range of cells', page_filter=is_google_sheet)
async def update_range_contents(browser: BrowserContext, range: str, new_contents_tsv: str):
    """Select ``range`` and paste ``new_contents_tsv`` into it via a synthetic paste event.

    Args:
        browser: Browser context whose current page shows the sheet.
        range: Cell or range to select before pasting.
        new_contents_tsv: Tab-separated text to paste.

    Returns:
        ActionResult confirming the update.
    """
    page = await browser.get_current_page()

    await select_cell_or_range(browser, range)

    # simulate paste event from clipboard with TSV content
    # The TSV is passed as an argument rather than spliced into the script
    # source, so backticks, `${...}` or backslashes in the data can neither
    # break the script nor run as code.
    await page.evaluate(
        """(tsv) => {
            const clipboardData = new DataTransfer();
            clipboardData.setData('text/plain', tsv);
            document.activeElement.dispatchEvent(new ClipboardEvent('paste', {clipboardData}));
        }""",
        new_contents_tsv,
    )

    return ActionResult(extracted_content=f'Updated cell {range} with {new_contents_tsv}', include_in_memory=False)
# many more snippets for keyboard-shortcut based Google Sheets automation can be found here, see:
# - https://github.com/philc/sheetkeys/blob/master/content_scripts/sheet_actions.js
# - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js
# - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts
# Tip: LLMs are bad at spatial reasoning, so don't make them navigate with arrow keys relative to the current cell.
# Given arrow keys, the model will try to jump from G1 to A2 by pressing Down, without realizing it needs Down followed by Left six times.
async def main():
    """Run four agents one after another on a single shared browser context.

    Stages: clear the sheet, research and fill it, extend it by three rows,
    then fact-check every entry.
    """
    async with await browser.new_context() as context:
        model = ChatOpenAI(model='gpt-4o')

        async def run_stage(task: str):
            # Every stage uses the same model, browser context and controller
            stage_agent = Agent(
                task=task,
                llm=model,
                browser_context=context,
                controller=controller,
            )
            await stage_agent.run()

        # eraser
        await run_stage("""
Clear all the existing values in columns A through F in this Google Sheet:
https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit
""")

        # researcher
        await run_stage("""
Google to find the full name, nationality, and date of birth of the CEO of the top 10 Fortune 100 companies.
For each company, append a row to this existing Google Sheet: https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit
Make sure column headers are present and all existing values in the sheet are formatted correctly.
Columns:
A: Company Name
B: CEO Full Name
C: CEO Country of Birth
D: CEO Date of Birth (YYYY-MM-DD)
E: Source URL where the information was found
""")

        # improvised continuer
        await run_stage("""
Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit
Add 3 more rows to the bottom continuing the existing pattern, make sure any data you add is sourced correctly.
""")

        # final fact checker
        await run_stage("""
Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit
Fact-check every entry, add a new column F with your findings for each row.
Make sure to check the source URL for each row, and make sure the information is correct.
""")


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,49 @@
# Goal: Implements a multi-agent system for online code editors, with separate agents for coding and execution.
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
async def main():
    """Open an online editor, let a coder agent write a calculator, then let an executor run it.

    All three agents are given the same browser context and are run one after another.
    """
    browser = Browser()
    async with await browser.new_context() as context:
        model = ChatOpenAI(model='gpt-4o')

        # Initialize browser agent
        agent1 = Agent(
            task='Open an online code editor programiz.',
            llm=model,
            browser_context=context,
        )
        executor = Agent(
            task='Executor. Execute the code written by the coder and suggest some updates if there are errors.',
            llm=model,
            browser_context=context,
        )

        coder = Agent(
            task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.',
            llm=model,
            browser_context=context,
        )
        await agent1.run()
        # The executor's task is to run "the code written by the coder",
        # so the coder has to finish before the executor starts.
        await coder.run()
        await executor.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,127 @@
"""
Goal: Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies.
X Posting Template using browser-use
----------------------------------------
This template allows you to automate posting on X using browser-use.
It supports:
- Posting new tweets
- Tagging users
- Replying to tweets
Add your target user and message in the config section.
target_user="XXXXX"
message="XXXXX"
reply_url="XXXXX"
Any issues, contact me on X @defichemist95
"""
import asyncio
import os
import sys
from dataclasses import dataclass
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.browser.browser import Browser, BrowserConfig
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
# ============ Configuration Section ============
@dataclass
class TwitterConfig:
    """Configuration for Twitter posting"""

    openai_api_key: str  # passed to ChatOpenAI as api_key
    chrome_path: str  # used as the browser binary path
    target_user: str  # Twitter handle without @
    message: str  # text that follows the @mention in the post
    reply_url: str  # URL the agent navigates to for the reply step
    headless: bool = False
    model: str = 'gpt-4o-mini'
    base_url: str = 'https://x.com/home'
# Customize these settings
# The 'XXXXX' values are placeholders to replace before running.
config = TwitterConfig(
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    chrome_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',  # This is for MacOS (Chrome)
    target_user='XXXXX',
    message='XXXXX',
    reply_url='XXXXX',
    headless=False,
)
def create_twitter_agent(config: TwitterConfig) -> Agent:
    """Build the agent that posts a tagged message and then replies to a tweet.

    Args:
        config: Model name, API key, browser settings and the post/reply details.

    Returns:
        An Agent wired to a fresh Browser and Controller, with step-by-step instructions as its task.
    """
    # Construct the full message with tag
    full_message = f'@{config.target_user} {config.message}'

    # Detailed instructions for the agent
    instructions = f"""Navigate to Twitter and create a post and reply to a tweet.
Here are the specific steps:
1. Go to {config.base_url}. See the text input field at the top of the page that says "What's happening?"
2. Look for the text input field at the top of the page that says "What's happening?"
3. Click the input field and type exactly this message:
"{full_message}"
4. Find and click the "Post" button (look for attributes: 'button' and 'data-testid="tweetButton"')
5. Do not click on the '+' button which will add another tweet.
6. Navigate to {config.reply_url}
7. Before replying, understand the context of the tweet by scrolling down and reading the comments.
8. Reply to the tweet under 50 characters.
Important:
- Wait for each element to load before interacting
- Make sure the message is typed exactly as shown
- Verify the post button is clickable before clicking
- Do not click on the '+' button which will add another tweet
"""

    llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key)
    browser_config = BrowserConfig(
        headless=config.headless,
        browser_binary_path=config.chrome_path,
    )
    browser = Browser(config=browser_config)
    controller = Controller()

    return Agent(
        task=instructions,
        llm=llm,
        controller=controller,
        browser=browser,
    )
async def post_tweet(agent: Agent):
    """Run ``agent`` for at most 100 steps, create its history GIF, and print the outcome.

    Any exception is caught and reported on stdout instead of propagating.
    """
    try:
        await agent.run(max_steps=100)
        agent.create_history_gif()
        print('Tweet posted successfully!')
    except Exception as exc:
        failure = f'Error posting tweet: {str(exc)}'
        print(failure)
async def main():
    """Create the Twitter agent from the module-level ``config`` and run it."""
    twitter_agent = create_twitter_agent(config)
    await twitter_agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,43 @@
# Goal: Automates webpage scrolling with various scrolling actions and text search functionality.
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
# Stop early with a clear message when the OpenAI key is missing from the environment.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError('OPENAI_API_KEY is not set')
"""
Example: Using the 'Scroll down' action.
This script demonstrates how the agent can navigate to a webpage and scroll down the content.
If no amount is specified, the agent will scroll down by one page height.
"""
llm = ChatOpenAI(model='gpt-4o')
# The active task scrolls to a text string; the commented-out task exercises pixel/page scrolling instead.
agent = Agent(
    # task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 100 pixels - then scroll down by 100 pixels - then scroll down by 10000 pixels.",
    task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'",
    llm=llm,
    browser=Browser(config=BrowserConfig(headless=False)),
)
async def main():
    """Run the module-level scrolling agent."""
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,128 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser
# Login details are read from the environment so that no real credentials are
# committed with the prompt. Set MIGROS_USERNAME and MIGROS_PASSWORD before running.
_migros_username = os.getenv('MIGROS_USERNAME', '<set MIGROS_USERNAME>')
_migros_password = os.getenv('MIGROS_PASSWORD', '<set MIGROS_PASSWORD>')

# Full shopping prompt for the agent; only the two credential lines are interpolated.
task = f"""
### Prompt for Shopping Agent – Migros Online Grocery Order
**Objective:**
Visit [Migros Online](https://www.migros.ch/en), search for the required grocery items, add them to the cart, select an appropriate delivery window, and complete the checkout process using TWINT.
**Important:**
- Make sure that you don't buy more than it's needed for each article.
- After your search, if you click the "+" button, it adds the item to the basket.
- if you open the basket sidewindow menu, you can close it by clicking the X button on the top right. This will help you navigate easier.
---
### Step 1: Navigate to the Website
- Open [Migros Online](https://www.migros.ch/en).
- You should be logged in as Nikolaos Kaliorakis
---
### Step 2: Add Items to the Basket
#### Shopping List:
**Meat & Dairy:**
- Beef Minced meat (1 kg)
- Gruyère cheese (grated preferably)
- 2 liters full-fat milk
- Butter (cheapest available)
**Vegetables:**
- Carrots (1kg pack)
- Celery
- Leeks (1 piece)
- 1 kg potatoes
At this stage, check the basket on the top right (indicates the price) and check if you bought the right items.
**Fruits:**
- 2 lemons
- Oranges (for snacking)
**Pantry Items:**
- Lasagna sheets
- Tahini
- Tomato paste (below CHF2)
- Black pepper refill (not with the mill)
- 2x 1L Oatly Barista(oat milk)
- 1 pack of eggs (10 egg package)
#### Ingredients I already have (DO NOT purchase):
- Olive oil, garlic, canned tomatoes, dried oregano, bay leaves, salt, chili flakes, flour, nutmeg, cumin.
---
### Step 3: Handling Unavailable Items
- If an item is **out of stock**, find the best alternative.
- Use the following recipe contexts to choose substitutions:
- **Pasta Bolognese & Lasagna:** Minced meat, tomato paste, lasagna sheets, milk (for béchamel), Gruyère cheese.
- **Hummus:** Tahini, chickpeas, lemon juice, olive oil.
- **Chickpea Curry Soup:** Chickpeas, leeks, curry, lemons.
- **Crispy Slow-Cooked Pork Belly with Vegetables:** Potatoes, butter.
- Example substitutions:
- If Gruyère cheese is unavailable, select another semi-hard cheese.
- If Tahini is unavailable, a sesame-based alternative may work.
---
### Step 4: Adjusting for Minimum Order Requirement
- If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it;s still you can buy some bread, dark chockolate.
- At this step, check if you have bought MORE items than needed. If the price is more then CHF200, you MUST remove items.
- If an item is not available, choose an alternative.
- if an age verification is needed, remove alcoholic products, we haven't verified yet.
---
### Step 5: Select Delivery Window
- Choose a **delivery window within the current week**. It's ok to pay up to CHF2 for the window selection.
- Preferably select a slot within the workweek.
---
### Step 6: Checkout
- Proceed to checkout.
- Select **TWINT** as the payment method.
- Check out.
-
- if it's needed the username is: {_migros_username}
- and the password is : {_migros_password}
---
### Step 7: Confirm Order & Output Summary
- Once the order is placed, output a summary including:
- **Final list of items purchased** (including any substitutions).
- **Total cost**.
- **Chosen delivery time**.
**Important:** Ensure efficiency and accuracy throughout the process."""
# Module-level browser so main() can close it after the agent has run
browser = Browser()
agent = Agent(
    task=task,
    llm=ChatOpenAI(model='gpt-4o'),
    browser=browser,
)
async def main():
    """Run the shopping agent, wait for Enter, and always close the browser."""
    try:
        await agent.run()
        input('Press Enter to close the browser...')
    finally:
        # Close the browser even if the run fails or the prompt is interrupted
        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,48 @@
# Goal: Automates posting on X (Twitter) using stored authentication cookies.
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import SecretStr
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext, BrowserContextConfig
# The key is passed to the Gemini client explicitly, so a missing GOOGLE_API_KEY is rejected up front.
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    raise ValueError('GOOGLE_API_KEY is not set')

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))

browser = Browser(
    config=BrowserConfig(
        # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    )
)
# The cookie file is looked up next to this script and handed to the browser context config
file_path = os.path.join(os.path.dirname(__file__), 'twitter_cookies.txt')
context = BrowserContext(browser=browser, config=BrowserContextConfig(cookies_file=file_path))
async def main():
    """Run the posting agent on the cookie-backed context for up to 25 steps, then wait for Enter."""
    posting_task = 'go to https://x.com. write a new post with the text "browser-use ftw", and submit it'
    poster = Agent(
        browser_context=context,
        task=posting_task,
        llm=llm,
        max_actions_per_step=4,
    )
    await poster.run(max_steps=25)
    input('Press Enter to close the browser...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,77 @@
# Goal: A general-purpose web navigation agent for tasks like flight booking and course searching.
import asyncio
import os
import sys
# Adjust Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from pydantic import SecretStr
from browser_use.agent.service import Agent
from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig
# Choose the chat model from whichever credentials are present in the
# environment: plain OpenAI takes precedence, then Azure OpenAI (which needs
# both its key and its endpoint). With neither, the script cannot run.
if os.environ.get('OPENAI_API_KEY'):
    llm = ChatOpenAI(model='gpt-4o')
elif not (os.environ.get('AZURE_OPENAI_KEY') and os.environ.get('AZURE_OPENAI_ENDPOINT')):
    raise ValueError('No LLM found. Please set OPENAI_API_KEY or AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT.')
else:
    llm = AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        api_key=SecretStr(os.environ.get('AZURE_OPENAI_KEY', '')),
        azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT', ''),
    )
# Browser shared by the agent below. The values here are the local/debug
# settings; the trailing notes record what production uses instead.
browser = Browser(
    config=BrowserConfig(
        disable_security=True,
        headless=False,  # True in production
        new_context_config=BrowserContextConfig(
            disable_security=True,
            maximum_wait_page_load_time=10,  # 20 on prod
            minimum_wait_page_load_time=1,  # 3 on prod
            # no_viewport=False constrains the viewport to the dimensions
            # given below, for cases that need a fixed viewport size.
            no_viewport=False,
            window_height=1100,
            window_width=1280,
            # trace_path='./tmp/web_voyager_agent',
        ),
    )
)
# Alternative tasks (swap one in for TASK below to try another scenario):
#
#   Find the lowest-priced one-way flight from Cairo to Montreal on February 21, 2025, including the total travel time and number of stops. on https://www.google.com/travel/flights/
#
#   Browse Coursera, which universities offer Master of Advanced Study in Engineering degrees? Tell me what is the latest application deadline for this degree? on https://www.coursera.org/

# Active task handed to the agent. The leading and trailing newlines are part
# of the prompt text.
TASK = (
    '\n'
    'Find and book a hotel in Paris with suitable accommodations for a family of four '
    '(two adults and two children) offering free cancellation for the dates of '
    'February 14-21, 2025. on https://www.booking.com/'
    '\n'
)
async def main():
    """Run the agent on ``TASK`` for at most 50 steps and save its history.

    The history returned by the run is written to ``./tmp/history.json``.
    """
    voyager = Agent(
        task=TASK,
        llm=llm,
        browser=browser,
        validate_output=True,
        enable_memory=False,
    )
    run_history = await voyager.run(max_steps=50)
    run_history.save_to_file('./tmp/history.json')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,39 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig
# Demo recording: https://preview.screen.studio/share/vuq91Ej8

# Chat model used by the agent, with the sampling temperature pinned to 0.0.
llm = ChatOpenAI(temperature=0.0, model='gpt-4o')
# Agent prompt: start on the Banana article and reach "Quantum mechanics" as
# fast as possible by clicking on the page. The start article was previously
# misspelled as "banna".
task = 'go to https://en.wikipedia.org/wiki/Banana and click on buttons on the wikipedia page to go as fast as possible from Banana to Quantum mechanics'
# Browser whose new contexts run with element highlighting switched off.
# NOTE(review): viewport_expansion=-1 presumably means "consider the whole
# page rather than just the visible viewport" — confirm against the
# BrowserContextConfig documentation.
browser = Browser(
    config=BrowserConfig(
        new_context_config=BrowserContextConfig(
            highlight_elements=False,
            viewport_expansion=-1,
        ),
    ),
)

# The agent is built once at import time and run from main(); vision input is
# disabled.
agent = Agent(
    task=task,
    llm=llm,
    browser=browser,
    use_vision=False,
)
async def main():
    """Run the module-level agent until its run() call returns."""
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())