[Add] browser-use and main.py

This commit is contained in:
tv0924@icloud.com 2025-05-18 21:57:54 +09:00
commit 96914d44ac
221 changed files with 30952 additions and 1 deletions

View file

@ -0,0 +1,7 @@
# **User Interfaces of Browser-Use**
| **File Name** | **User Interface** | **Description** | **Example Usage** |
|------------------------|-------------------|-------------------------------------------|-------------------------------------------|
| `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` |
| `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` |
| `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` |

View file

@ -0,0 +1,98 @@
"""
To Use It:
Example 1: Using OpenAI (default), with default task: 'go to reddit and search for posts about browser-use'
python command_line.py
Example 2: Using OpenAI with a Custom Query
python command_line.py --query "go to google and search for browser-use"
Example 3: Using Anthropic's Claude Model with a Custom Query
python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic
"""
import argparse
import asyncio
import os
import sys
# Ensure local repository (browser_use) is accessible
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
def get_llm(provider: str):
if provider == 'anthropic':
from langchain_anthropic import ChatAnthropic
api_key = os.getenv('ANTHROPIC_API_KEY')
if not api_key:
raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')
return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)
elif provider == 'openai':
from langchain_openai import ChatOpenAI
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
raise ValueError('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')
return ChatOpenAI(model='gpt-4o', temperature=0.0)
else:
raise ValueError(f'Unsupported provider: {provider}')
def parse_arguments():
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.')
parser.add_argument(
'--query', type=str, help='The query to process', default='go to reddit and search for posts about browser-use'
)
parser.add_argument(
'--provider',
type=str,
choices=['openai', 'anthropic'],
default='openai',
help='The model provider to use (default: openai)',
)
return parser.parse_args()
def initialize_agent(query: str, provider: str):
"""Initialize the browser agent with the given query and provider."""
llm = get_llm(provider)
controller = Controller()
browser = Browser(config=BrowserConfig())
return Agent(
task=query,
llm=llm,
controller=controller,
browser=browser,
use_vision=True,
max_actions_per_step=1,
), browser
async def main():
"""Main async function to run the agent."""
args = parse_arguments()
agent, browser = initialize_agent(args.query, args.provider)
await agent.run(max_steps=25)
input('Press Enter to close the browser...')
await browser.close()
if __name__ == '__main__':
asyncio.run(main())

View file

@ -0,0 +1,109 @@
import asyncio
import os
import sys
from dataclasses import dataclass
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
# Third-party imports
import gradio as gr
from langchain_openai import ChatOpenAI
from rich.console import Console
from rich.panel import Panel
from rich.text import Text
# Local module imports
from browser_use import Agent
@dataclass
class ActionResult:
is_done: bool
extracted_content: str | None
error: str | None
include_in_memory: bool
@dataclass
class AgentHistoryList:
all_results: list[ActionResult]
all_model_outputs: list[dict]
def parse_agent_history(history_str: str) -> None:
console = Console()
# Split the content into sections based on ActionResult entries
sections = history_str.split('ActionResult(')
for i, section in enumerate(sections[1:], 1): # Skip first empty section
# Extract relevant information
content = ''
if 'extracted_content=' in section:
content = section.split('extracted_content=')[1].split(',')[0].strip("'")
if content:
header = Text(f'Step {i}', style='bold blue')
panel = Panel(content, title=header, border_style='blue')
console.print(panel)
console.print()
async def run_browser_task(
task: str,
api_key: str,
model: str = 'gpt-4o',
headless: bool = True,
) -> str:
if not api_key.strip():
return 'Please provide an API key'
os.environ['OPENAI_API_KEY'] = api_key
try:
agent = Agent(
task=task,
llm=ChatOpenAI(model='gpt-4o'),
)
result = await agent.run()
# TODO: The result cloud be parsed better
return result
except Exception as e:
return f'Error: {str(e)}'
def create_ui():
with gr.Blocks(title='Browser Use GUI') as interface:
gr.Markdown('# Browser Use Task Automation')
with gr.Row():
with gr.Column():
api_key = gr.Textbox(label='OpenAI API Key', placeholder='sk-...', type='password')
task = gr.Textbox(
label='Task Description',
placeholder='E.g., Find flights from New York to London for next week',
lines=3,
)
model = gr.Dropdown(choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4')
headless = gr.Checkbox(label='Run Headless', value=True)
submit_btn = gr.Button('Run Task')
with gr.Column():
output = gr.Textbox(label='Output', lines=10, interactive=False)
submit_btn.click(
fn=lambda *args: asyncio.run(run_browser_task(*args)),
inputs=[task, api_key, model, headless],
outputs=output,
)
return interface
if __name__ == '__main__':
demo = create_ui()
demo.launch()

View file

@ -0,0 +1,86 @@
"""
To use it, you'll need to install streamlit, and run with:
python -m streamlit run streamlit_demo.py
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
import streamlit as st
from browser_use import Agent
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.controller.service import Controller
if os.name == 'nt':
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
# Function to get the LLM based on provider
def get_llm(provider: str):
if provider == 'anthropic':
from langchain_anthropic import ChatAnthropic
api_key = os.getenv('ANTHROPIC_API_KEY')
if not api_key:
st.error('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')
st.stop()
return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)
elif provider == 'openai':
from langchain_openai import ChatOpenAI
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
st.error('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')
st.stop()
return ChatOpenAI(model='gpt-4o', temperature=0.0)
else:
st.error(f'Unsupported provider: {provider}')
st.stop()
# Function to initialize the agent
def initialize_agent(query: str, provider: str):
llm = get_llm(provider)
controller = Controller()
browser = Browser(config=BrowserConfig())
return Agent(
task=query,
llm=llm,
controller=controller,
browser=browser,
use_vision=True,
max_actions_per_step=1,
), browser
# Streamlit UI
st.title('Automated Browser Agent with LLMs 🤖')
query = st.text_input('Enter your query:', 'go to reddit and search for posts about browser-use')
provider = st.radio('Select LLM Provider:', ['openai', 'anthropic'], index=0)
if st.button('Run Agent'):
st.write('Initializing agent...')
agent, browser = initialize_agent(query, provider)
async def run_agent():
with st.spinner('Running automation...'):
await agent.run(max_steps=25)
st.success('Task completed! 🎉')
asyncio.run(run_agent())
st.button('Close Browser', on_click=lambda: asyncio.run(browser.close()))