[Add] browser-use and main.py

This commit is contained in:
tv0924@icloud.com 2025-05-18 21:57:54 +09:00
commit 96914d44ac
221 changed files with 30952 additions and 1 deletions

View file

@ -0,0 +1,94 @@
"""
Action filters (domains and page_filter) let you limit actions available to the Agent on a step-by-step/page-by-page basis.
@registry.action(..., domains=['*'], page_filter=lambda page: True)
async def some_action(browser: BrowserContext):
...
This helps prevent the LLM from deciding to use an action that is not compatible with the current page.
It helps limit decision fatigue by scoping actions only to pages where they make sense.
It also helps prevent mis-triggering stateful actions or actions that could break other programs or leak secrets.
For example:
- only run on certain domains @registry.action(..., domains=['example.com', '*.example.com', 'example.co.*']) (supports globs, but no regex)
- only fill in a password on a specific login page url
- only run if this action has not run before on this page (e.g. by looking up the url in a file on disk)
During each step, the agent recalculates the actions available specifically for that page, and informs the LLM.
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from playwright.async_api import Page
from browser_use.agent.service import Agent, Browser, BrowserContext, Controller
# Initialize controller and registry; the filtered example actions below are
# registered on this registry.
controller = Controller()
registry = controller.registry
# Action will only be available to Agent on Google domains because of the domain filter
@registry.action(description='Trigger disco mode', domains=['google.com', '*.google.com'])
async def disco_mode(browser: BrowserContext):
    """Make every element on the current page wiggle.

    Args:
        browser: Browser context whose current page receives the animation.
    """
    page = await browser.get_current_page()
    # insertRule() on document.styleSheets[0] throws when the page has no
    # stylesheet or when the first stylesheet is cross-origin, so fall back to
    # injecting a dedicated <style> element in that case.
    await page.evaluate("""() => {
        // define the wiggle animation
        const rule = '@keyframes wiggle { 0% { transform: rotate(0deg); } 50% { transform: rotate(10deg); } 100% { transform: rotate(0deg); } }';
        try {
            document.styleSheets[0].insertRule(rule);
        } catch (err) {
            const style = document.createElement('style');
            style.textContent = rule;
            (document.head || document.documentElement).appendChild(style);
        }
        document.querySelectorAll("*").forEach(element => {
            element.style.animation = "wiggle 0.5s infinite";
        });
    }""")
# you can create a custom page filter function that determines if the action should be available for a given page
def is_login_page(page: Page) -> bool:
    """Return True when the page URL contains 'login' or 'signin', ignoring case."""
    lowered_url = page.url.lower()
    return any(marker in lowered_url for marker in ('login', 'signin'))
# then use it in the action decorator to limit the action to only be available on pages where the filter returns True
@registry.action(description='Use the force, luke', page_filter=is_login_page)
async def use_the_force(browser: BrowserContext):
    """Replace the body of the current page with a fixed message."""
    current_page = await browser.get_current_page()
    # this will only ever run on pages that matched the filter
    assert is_login_page(current_page)
    script = """() => { document.querySelector('body').innerHTML = 'These are not the droids you are looking for';}"""
    await current_page.evaluate(script)
async def main():
    """Main function to run the example.

    Creates a browser and an agent, runs the agent for at most 10 steps and
    closes the browser afterwards, even if the run raises.
    """
    browser = Browser()
    llm = ChatOpenAI(model_name='gpt-4o')
    # Create the agent
    # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available, it should work on Google.com though.
    agent = Agent(
        task="""
Go to apple.com and trigger disco mode (if dont know how to do that, then just move on).
Then go to google.com and trigger disco mode.
After that, go to the Google login page and Use the force, luke.
""",
        llm=llm,
        browser=browser,
        controller=controller,
    )
    try:
        # Run the agent
        await agent.run(max_steps=10)
    finally:
        # Cleanup must happen even when the run fails, otherwise the browser stays open.
        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,98 @@
import asyncio
import json
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import httpx
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import ActionResult, Agent, Controller
# One person in the agent's structured output: a name plus an optional email.
class Person(BaseModel):
    name: str
    # Defaults to None when no email is provided.
    email: str | None = None
# Container model handed to the Controller as output_model and parsed from the
# agent's final result in main().
class PersonList(BaseModel):
    people: list[Person]
# Controller with the 'search_google' action excluded; the final output is
# described by the PersonList model.
controller = Controller(exclude_actions=['search_google'], output_model=PersonList)
# Sent as the Bearer credential by search_web; the script refuses to start without it.
BEARER_TOKEN = os.getenv('BEARER_TOKEN')
if not BEARER_TOKEN:
    # use the api key for ask tessa
    # you can also use other apis like exa, xAI, perplexity, etc.
    raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key')
@controller.registry.action('Search the web for a specific query')
async def search_web(query: str):
    """Query the search API and return the high-scoring sources as JSON text.

    Args:
        query: Free-text search query forwarded to the API.

    Returns:
        ActionResult whose extracted_content is a JSON array of the sources
        scoring at least 0.8, reduced to url/title/content/author/score.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    keys_to_use = ['url', 'title', 'content', 'author', 'score']
    headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
    async with httpx.AsyncClient() as client:
        response = await client.post('https://asktessa.ai/api/search', headers=headers, json={'query': query})
    # Surface an error response directly instead of as a KeyError further down.
    response.raise_for_status()
    sources = response.json().get('sources') or []
    final_results = [
        {key: source[key] for key in keys_to_use if key in source}
        for source in sources
        # A source without a score is treated as low-confidence and dropped.
        if (source.get('score') or 0) >= 0.8
    ]
    result_text = json.dumps(final_results, indent=4)
    print(result_text)
    return ActionResult(extracted_content=result_text, include_in_memory=True)
# People to look up; main() appends these names, one per line, to the task prompt.
names = [
    'Ruedi Aebersold',
    'Bernd Bodenmiller',
    'Eugene Demler',
    'Erich Fischer',
    'Pietro Gambardella',
    'Matthias Huss',
    'Reto Knutti',
    'Maksym Kovalenko',
    'Antonio Lanzavecchia',
    'Maria Lukatskaya',
    'Jochen Markard',
    'Javier Pérez-Ramírez',
    'Federica Sallusto',
    'Gisbert Schneider',
    'Sonia I. Seneviratne',
    'Michael Siegrist',
    'Johan Six',
    'Tanja Stadler',
    'Shinichi Sunagawa',
    'Michael Bruce Zimmermann',
]
async def main():
    """Ask the agent to find an email for every name and print the parsed result."""
    instructions = 'use search_web with "find email address of the following ETH professor:" for each of the following persons in a list of actions. Finally return the list with name and email if provided'
    task = instructions + '\n' + '\n'.join(names)
    llm = ChatOpenAI(model='gpt-4o')
    agent = Agent(task=task, llm=llm, controller=controller, max_actions_per_step=20)
    history = await agent.run()
    final_answer = history.final_result()
    if not final_answer:
        print('No result')
        return
    # The final result is JSON shaped like the controller's output model.
    parsed: PersonList = PersonList.model_validate_json(final_answer)
    for person in parsed.people:
        print(f'{person.name} - {person.email}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,60 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import pyperclip
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Shared browser for this example, configured with headless=False.
browser = Browser(
    config=BrowserConfig(
        headless=False,
    )
)
# Controller on which the clipboard actions below are registered.
controller = Controller()
@controller.registry.action('Copy text to clipboard')
def copy_to_clipboard(text: str):
    """Put ``text`` on the clipboard and echo it back as the action result."""
    pyperclip.copy(text)
    outcome = ActionResult(extracted_content=text)
    return outcome
@controller.registry.action('Paste text from clipboard')
async def paste_from_clipboard(browser: BrowserContext):
    """Type the current clipboard text into the active page via the keyboard."""
    clipboard_text = pyperclip.paste()
    # send text to browser
    current_page = await browser.get_current_page()
    await current_page.keyboard.type(clipboard_text)
    return ActionResult(extracted_content=clipboard_text)
async def main():
    """Run the clipboard example and close the browser afterwards, even on failure."""
    task = 'Copy the text "Hello, world!" to the clipboard, then go to google.com and paste the text'
    model = ChatOpenAI(model='gpt-4o')
    agent = Agent(
        task=task,
        llm=model,
        controller=controller,
        browser=browser,
    )
    try:
        await agent.run()
    finally:
        # Close the browser even when the run raises, so it is not left open.
        await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,236 @@
"""
Description: These Python modules are designed to capture detailed
browser usage data for analysis, with both server and client
components working together to record and store the information.
Author: Carlos A. Planchón
https://github.com/carlosplanchon/
Adapt this code to your needs.
Feedback is appreciated!
"""
#####################
# #
# --- UTILS --- #
# #
#####################
import base64
def b64_to_png(b64_string: str, output_file):
    """
    Convert a Base64-encoded string to a PNG file.

    The data is decoded before the output file is opened, so malformed input
    raises without creating or truncating ``output_file``.

    :param b64_string: A string containing Base64-encoded data
    :param output_file: The path to the output PNG file
    :raises binascii.Error: If ``b64_string`` is not valid Base64
    """
    png_bytes = base64.b64decode(b64_string)
    with open(output_file, 'wb') as f:
        f.write(png_bytes)
###################################################################
# #
# --- FASTAPI API TO RECORD AND SAVE Browser-Use ACTIVITY --- #
# #
###################################################################
# Save to api.py and run with `python api.py`
# ! pip install uvicorn
# ! pip install fastapi
# ! pip install prettyprinter
import json
from pathlib import Path
import prettyprinter
from fastapi import FastAPI, Request
# Enable prettyprinter's optional integrations before the first cpprint call.
prettyprinter.install_extras()
# FastAPI application that exposes the recording endpoint below.
app = FastAPI()
@app.post('/post_agent_history_step')
async def post_agent_history_step(request: Request):
    """Store the posted JSON body as the next numbered file in ``recordings/``.

    Files are named ``<n>.json``, where ``n`` is one more than the highest
    existing numeric file name, or 1 when there is none.
    """
    payload = await request.json()
    prettyprinter.cpprint(payload)

    # Ensure the "recordings" folder exists using pathlib
    target_dir = Path('recordings')
    target_dir.mkdir(exist_ok=True)

    # Collect the numbers already used by existing .json files
    used_numbers = []
    for entry in target_dir.iterdir():
        if not (entry.is_file() and entry.suffix == '.json'):
            continue
        try:
            used_numbers.append(int(entry.stem))
        except ValueError:
            # In case the file name isn't just a number
            continue
    next_number = max(used_numbers, default=0) + 1

    # Save the JSON data to the file
    file_path = target_dir / f'{next_number}.json'
    with file_path.open('w') as out:
        json.dump(payload, out, indent=2)
    return {'status': 'ok', 'message': f'Saved to {file_path}'}
if __name__ == '__main__':
    import uvicorn

    # NOTE(review): host 0.0.0.0 listens on every network interface, while the
    # client in this file posts to 127.0.0.1 - confirm that exposing this
    # endpoint beyond localhost is intended.
    uvicorn.run(app, host='0.0.0.0', port=9000)
##############################################################
# #
# --- CLIENT TO RECORD AND SAVE Browser-Use ACTIVITY --- #
# #
##############################################################
"""
pyobjtojson:
A Python library to safely and recursively serialize any Python object
(including Pydantic models and dataclasses) into JSON-ready structures,
gracefully handling circular references.
"""
# ! pip install -U pyobjtojson
# ! pip install -U prettyprinter
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import requests
from langchain_openai import ChatOpenAI
from pyobjtojson import obj_to_json
from browser_use import Agent
# import prettyprinter
# prettyprinter.install_extras()
def send_agent_history_step(data, timeout: float = 30.0):
    """POST one step summary to the local recording API and return its JSON reply.

    Args:
        data: JSON-serializable step summary.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` waits indefinitely for an unresponsive server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    url = 'http://127.0.0.1:9000/post_agent_history_step'
    response = requests.post(url, json=data, timeout=timeout)
    # Report a server-side failure explicitly rather than returning its error body.
    response.raise_for_status()
    return response.json()
def _last_json_elem(items):
    """Serialize ``items`` with obj_to_json and return the last element, or None if empty."""
    serialized = obj_to_json(obj=items, check_circular=False)
    return serialized[-1] if serialized else None


async def record_activity(agent_obj):
    """Step hook: capture the current page and latest history entries and send them to the recorder.

    Args:
        agent_obj: The running agent. Its ``browser_context`` is used for the
            HTML and screenshot, and ``state.history`` (when present) for the
            most recent URL, thoughts, outputs, actions and extracted content.
    """
    print('--- ON_STEP_START HOOK ---')
    website_html: str = await agent_obj.browser_context.get_page_html()
    website_screenshot: str = await agent_obj.browser_context.take_screenshot()

    print('--> History:')
    # An agent without state/history still gets its page recorded; the
    # history-derived fields are then None instead of raising on None.
    history = getattr(getattr(agent_obj, 'state', None), 'history', None)

    if history is None:
        latest = {
            'url': None,
            'model_thoughts': None,
            'model_outputs': None,
            'model_actions': None,
            'extracted_content': None,
        }
    else:
        latest = {
            'url': _last_json_elem(history.urls()),
            'model_thoughts': _last_json_elem(history.model_thoughts()),
            'model_outputs': _last_json_elem(history.model_outputs()),
            'model_actions': _last_json_elem(history.model_actions()),
            'extracted_content': _last_json_elem(history.extracted_content()),
        }

    model_step_summary = {
        'website_html': website_html,
        'website_screenshot': website_screenshot,
        **latest,
    }

    print('--- MODEL STEP SUMMARY ---')
    # Synchronous HTTP call: the hook does not return until the recorder has replied.
    send_agent_history_step(data=model_step_summary)
# Example agent; run_agent() below runs it with record_activity as its step hook.
agent = Agent(
    task='Compare the price of gpt-4o and DeepSeek-V3',
    llm=ChatOpenAI(model='gpt-4o'),
)
async def run_agent():
    """Run the agent for up to 30 steps with the recording hook; print any error instead of raising."""
    try:
        await agent.run(on_step_start=record_activity, max_steps=30)
    except Exception as exc:
        print(exc)


asyncio.run(run_agent())

View file

@ -0,0 +1,112 @@
import asyncio
import logging
import os
import sys
from pathlib import Path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import anyio
from langchain_openai import ChatOpenAI
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
logger = logging.getLogger(__name__)
# Initialize controller first
# The browser is configured with headless=False and a macOS-style path to the
# Google Chrome binary; adjust browser_binary_path for other systems.
browser = Browser(
    config=BrowserConfig(
        headless=False,
        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    )
)
# Controller on which the file actions below are registered.
controller = Controller()
@controller.action(
    'Upload file to interactive element with file path ',
)
async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
    """Upload ``path`` to the file input associated with the element at ``index``.

    Args:
        index: Index of the interactive element to upload to.
        path: File to upload; must be listed in ``available_file_paths``.
        browser: Browser context used to resolve and locate the element.
        available_file_paths: Paths the agent is allowed to use.

    Returns:
        ActionResult with a success message, or with ``error`` set when the
        path is not allowed, the file is missing, no upload element exists,
        the element cannot be located, or the upload itself fails.
    """
    if path not in available_file_paths:
        return ActionResult(error=f'File path {path} is not available')
    if not os.path.exists(path):
        return ActionResult(error=f'File {path} does not exist')

    dom_el = await browser.get_dom_element_by_index(index)
    file_upload_dom_el = dom_el.get_file_upload_element()
    if file_upload_dom_el is None:
        msg = f'No file upload element found at index {index}'
        logger.info(msg)
        return ActionResult(error=msg)

    file_upload_el = await browser.get_locate_element(file_upload_dom_el)
    if file_upload_el is None:
        # Different failure from the one above: an upload element exists, but
        # it could not be located, so report that explicitly.
        msg = f'Could not locate the file upload element for index {index}'
        logger.info(msg)
        return ActionResult(error=msg)

    try:
        await file_upload_el.set_input_files(path)
    except Exception as e:
        msg = f'Failed to upload file to index {index}: {e}'
        logger.info(msg)
        return ActionResult(error=msg)

    msg = f'Successfully uploaded file to index {index}'
    logger.info(msg)
    return ActionResult(extracted_content=msg, include_in_memory=True)
@controller.action('Read the file content of a file given a path')
async def read_file(path: str, available_file_paths: list[str]):
    """Read a text file the agent is allowed to access.

    Args:
        path: File to read; must be listed in ``available_file_paths``.
        available_file_paths: Paths the agent is allowed to use.

    Returns:
        ActionResult with the file content, or with ``error`` set when the
        path is not allowed or the file cannot be read as text.
    """
    if path not in available_file_paths:
        return ActionResult(error=f'File path {path} is not available')
    try:
        async with await anyio.open_file(path, 'r') as f:
            content = await f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Report read failures the same way upload_file reports its failures,
        # instead of letting the exception escape the action.
        msg = f'Failed to read file {path}: {e}'
        logger.info(msg)
        return ActionResult(error=msg)
    msg = f'File content: {content}'
    logger.info(msg)
    return ActionResult(extracted_content=msg, include_in_memory=True)
def create_file(file_type: str = 'txt'):
    """Write ``tmp.<file_type>`` containing 'test' in the working directory and return its absolute path."""
    file_name = f'tmp.{file_type}'
    with open(file_name, 'w') as handle:
        handle.write('test')
    file_path = Path.cwd() / file_name
    logger.info(f'Created file: {file_path}')
    return str(file_path)
async def main():
    """Create three sample files, let the agent read and upload them, then close the browser."""
    task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields'
    available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')]
    model = ChatOpenAI(model='gpt-4o')
    agent = Agent(
        task=task,
        llm=model,
        controller=controller,
        browser=browser,
        available_file_paths=available_file_paths,
    )
    try:
        await agent.run()
    finally:
        # Close the browser even when the run raises, so it is not left open.
        await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,97 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use import Agent, Controller
from browser_use.agent.views import ActionResult
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
# Parameters for hover_element. All fields are optional; hover_element checks
# xpath first, then selector, then index.
class HoverAction(BaseModel):
    index: int | None = None
    xpath: str | None = None
    selector: str | None = None
# Shared browser for this example, configured with headless=False.
browser = Browser(
    config=BrowserConfig(
        headless=False,
    )
)
# Controller on which the hover action below is registered.
controller = Controller()
@controller.registry.action(
    'Hover over an element',
    param_model=HoverAction,  # Define this model with at least "index: int" field
)
async def hover_element(params: HoverAction, browser: BrowserContext):
    """
    Hover over an element chosen by XPath, CSS selector, or selector-map index.

    The locators are tried in that order and the first one provided wins. The
    success message names the locator that was actually used.

    Args:
        params: Locator(s) for the element; at least one must be set.
        browser: Browser context used to locate the element.

    Returns:
        ActionResult describing which element was hovered.

    Raises:
        Exception: If no locator is given, the element cannot be found, or
            the hover itself fails.
    """
    session = await browser.get_session()
    state = session.cached_state

    if params.xpath:
        # Use XPath to locate the element
        target = f'with XPath {params.xpath}'
        element_handle = await browser.get_locate_element_by_xpath(params.xpath)
        if element_handle is None:
            raise Exception(f'Failed to locate element with XPath {params.xpath}')
    elif params.selector:
        # Use CSS selector to locate the element
        target = f'with CSS Selector {params.selector}'
        element_handle = await browser.get_locate_element_by_css_selector(params.selector)
        if element_handle is None:
            raise Exception(f'Failed to locate element with CSS Selector {params.selector}')
    elif params.index is not None:
        # Use index to locate the element
        target = f'at index {params.index}'
        if state is None or params.index not in state.selector_map:
            raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')
        element_node = state.selector_map[params.index]
        element_handle = await browser.get_locate_element(element_node)
        if element_handle is None:
            raise Exception(f'Failed to locate element with index {params.index}')
    else:
        raise Exception('Either index, xpath or selector must be provided')

    try:
        await element_handle.hover()
    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise Exception(f'❌ Failed to hover over element: {e}') from e

    return ActionResult(extracted_content=f'🖱️ Hovered over element {target}', include_in_memory=True)
async def main():
    """Run the hover example and close the browser afterwards, even on failure."""
    task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the css selector #hoverdivpara, then click on "Can you click me?"'
    # task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the xpath //*[@id="hoverdivpara"], then click on "Can you click me?"'
    model = ChatOpenAI(model='gpt-4o')
    agent = Agent(
        task=task,
        llm=model,
        controller=controller,
        browser=browser,
    )
    try:
        await agent.run()
    finally:
        # Close the browser even when the run raises, so it is not left open.
        await browser.close()
    input('Press Enter to close...')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,45 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from browser_use import ActionResult, Agent, Controller
# Controller on which the custom 'done' action below is registered.
controller = Controller()
@controller.registry.action('Done with task ')
async def done(text: str):
    """Email the final result text and mark the task as done.

    Args:
        text: Result text placed in the email body.

    Returns:
        ActionResult with ``is_done=True``.
    """
    import yagmail

    # To send emails use
    # STEP 1: go to https://support.google.com/accounts/answer/185833
    # STEP 2: Create an app password (you can't use here your normal gmail password)
    # STEP 3: Use the app password in the code below for the password
    # The context manager closes the SMTP connection once the mail is sent.
    with yagmail.SMTP('your_email@gmail.com', 'your_app_password') as yag:
        yag.send(
            to='recipient@example.com',
            subject='Test Email',
            contents=f'result\n: {text}',
        )
    return ActionResult(is_done=True, extracted_content='Email sent!')
async def main():
    """Run an agent that visits browser-use.com and then calls the custom done action."""
    # The domain was misspelled as 'brower-use.com', which sent the agent to the wrong site.
    task = 'go to browser-use.com and then done'
    model = ChatOpenAI(model='gpt-4o')
    agent = Agent(task=task, llm=model, controller=controller)
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,57 @@
import asyncio
import logging
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from onepassword.client import Client # pip install onepassword-sdk
from browser_use import ActionResult, Agent, Controller
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# 1Password settings read from the environment; either is None when unset.
OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN')
OP_ITEM_ID = os.getenv('OP_ITEM_ID')  # Go to 1Password, right click on the item, click "Copy Secret Reference"
# Controller on which the 1Password action below is registered.
controller = Controller()
@controller.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com'])
async def get_1password_2fa() -> ActionResult:
    """
    Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK.

    Returns:
        ActionResult carrying the one-time passcode, or with ``error`` set
        when the required environment variables are missing.
    """
    # Without these the lookup would go to 'op://Private/None/...' and fail
    # with an opaque SDK error, so report the real cause up front.
    if not OP_SERVICE_ACCOUNT_TOKEN or not OP_ITEM_ID:
        msg = 'OP_SERVICE_ACCOUNT_TOKEN and OP_ITEM_ID must be set to get a 2FA code from 1Password'
        logger.error(msg)
        return ActionResult(error=msg)

    client = await Client.authenticate(
        # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started
        auth=OP_SERVICE_ACCOUNT_TOKEN,
        integration_name='Browser-Use',
        integration_version='v1.0.0',
    )
    mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode')
    return ActionResult(extracted_content=mfa_code)
async def main():
    """Run an agent that signs in to Google and uses the 1Password action when asked for a 2FA code."""
    # Example task using the 1Password 2FA action
    llm = ChatOpenAI(model='gpt-4o')
    agent = Agent(
        task='Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it',
        llm=llm,
        controller=controller,
    )
    outcome = await agent.run()
    print(f'Task completed with result: {outcome}')


if __name__ == '__main__':
    asyncio.run(main())

View file

@ -0,0 +1,50 @@
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from browser_use.agent.service import Agent
from browser_use.controller.service import Controller
# Initialize controller first: the save_models action below is registered on it.
controller = Controller()
# One model entry as written to models.txt by save_models.
class Model(BaseModel):
    title: str
    url: str
    likes: int
    license: str
# Parameter model for the save_models action: the list of entries to save.
class Models(BaseModel):
    models: list[Model]
@controller.action('Save models', param_model=Models)
def save_models(params: Models):
    """Append one line per model to ``models.txt``.

    Args:
        params: The models to save; each is written as
            ``title (url): likes likes, license``.
    """
    # Explicit UTF-8: the text comes from web pages and may contain characters
    # that the platform's default encoding cannot represent.
    with open('models.txt', 'a', encoding='utf-8') as f:
        f.writelines(
            f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n' for model in params.models
        )
# video: https://preview.screen.studio/share/EtOhIk0P
async def main():
    """Run an agent that looks up Hugging Face models and saves the top five via save_models."""
    llm = ChatOpenAI(model='gpt-4o')
    agent = Agent(
        task='Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.',
        llm=llm,
        controller=controller,
    )
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())