[Add] browser-use and main.py

2025-05-18 21:57:54 +09:00 · 2025-05-18 21:57:54 +09:00 · 96914d44ac
commit 96914d44ac
parent 08e64bdf45
221 changed files with 30952 additions and 1 deletions
--- a/browser-use/browser_use/dom/views.py
+++ b/browser-use/browser_use/dom/views.py
@ -0,0 +1,265 @@
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING, Optional
+
+from browser_use.dom.history_tree_processor.view import CoordinateSet, HashedDomElement, ViewportInfo
+from browser_use.utils import time_execution_sync
+
+# Avoid circular import issues
+if TYPE_CHECKING:
+	from .views import DOMElementNode
+
+
+@dataclass(frozen=False)
+class DOMBaseNode:
+	is_visible: bool
+	# Use None as default and set parent later to avoid circular reference issues
+	parent: Optional['DOMElementNode']
+
+	def __json__(self) -> dict:
+		raise NotImplementedError('DOMBaseNode is an abstract class')
+
+
+@dataclass(frozen=False)
+class DOMTextNode(DOMBaseNode):
+	text: str
+	type: str = 'TEXT_NODE'
+
+	def has_parent_with_highlight_index(self) -> bool:
+		current = self.parent
+		while current is not None:
+			# stop if the element has a highlight index (will be handled separately)
+			if current.highlight_index is not None:
+				return True
+
+			current = current.parent
+		return False
+
+	def is_parent_in_viewport(self) -> bool:
+		if self.parent is None:
+			return False
+		return self.parent.is_in_viewport
+
+	def is_parent_top_element(self) -> bool:
+		if self.parent is None:
+			return False
+		return self.parent.is_top_element
+
+	def __json__(self) -> dict:
+		return {
+			'text': self.text,
+			'type': self.type,
+		}
+
+
+@dataclass(frozen=False)
+class DOMElementNode(DOMBaseNode):
+	"""
+	xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
+	To properly reference the element we need to recursively switch the root node until we find the element (work you way up the tree with `.parent`)
+	"""
+
+	tag_name: str
+	xpath: str
+	attributes: dict[str, str]
+	children: list[DOMBaseNode]
+	is_interactive: bool = False
+	is_top_element: bool = False
+	is_in_viewport: bool = False
+	shadow_root: bool = False
+	highlight_index: int | None = None
+	viewport_coordinates: CoordinateSet | None = None
+	page_coordinates: CoordinateSet | None = None
+	viewport_info: ViewportInfo | None = None
+
+	"""
+	### State injected by the browser context.
+
+	The idea is that the clickable elements are sometimes persistent from the previous page -> tells the model which objects are new/_how_ the state has changed
+	"""
+	is_new: bool | None = None
+
+	def __json__(self) -> dict:
+		return {
+			'tag_name': self.tag_name,
+			'xpath': self.xpath,
+			'attributes': self.attributes,
+			'is_visible': self.is_visible,
+			'is_interactive': self.is_interactive,
+			'is_top_element': self.is_top_element,
+			'is_in_viewport': self.is_in_viewport,
+			'shadow_root': self.shadow_root,
+			'highlight_index': self.highlight_index,
+			'viewport_coordinates': self.viewport_coordinates,
+			'page_coordinates': self.page_coordinates,
+			'children': [child.__json__() for child in self.children],
+		}
+
+	def __repr__(self) -> str:
+		tag_str = f'<{self.tag_name}'
+
+		# Add attributes
+		for key, value in self.attributes.items():
+			tag_str += f' {key}="{value}"'
+		tag_str += '>'
+
+		# Add extra info
+		extras = []
+		if self.is_interactive:
+			extras.append('interactive')
+		if self.is_top_element:
+			extras.append('top')
+		if self.shadow_root:
+			extras.append('shadow-root')
+		if self.highlight_index is not None:
+			extras.append(f'highlight:{self.highlight_index}')
+		if self.is_in_viewport:
+			extras.append('in-viewport')
+
+		if extras:
+			tag_str += f' [{", ".join(extras)}]'
+
+		return tag_str
+
+	@cached_property
+	def hash(self) -> HashedDomElement:
+		from browser_use.dom.history_tree_processor.service import (
+			HistoryTreeProcessor,
+		)
+
+		return HistoryTreeProcessor._hash_dom_element(self)
+
+	def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
+		text_parts = []
+
+		def collect_text(node: DOMBaseNode, current_depth: int) -> None:
+			if max_depth != -1 and current_depth > max_depth:
+				return
+
+			# Skip this branch if we hit a highlighted element (except for the current node)
+			if isinstance(node, DOMElementNode) and node != self and node.highlight_index is not None:
+				return
+
+			if isinstance(node, DOMTextNode):
+				text_parts.append(node.text)
+			elif isinstance(node, DOMElementNode):
+				for child in node.children:
+					collect_text(child, current_depth + 1)
+
+		collect_text(self, 0)
+		return '\n'.join(text_parts).strip()
+
+	@time_execution_sync('--clickable_elements_to_string')
+	def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
+		"""Convert the processed DOM content to HTML."""
+		formatted_text = []
+
+		def process_node(node: DOMBaseNode, depth: int) -> None:
+			next_depth = int(depth)
+			depth_str = depth * '\t'
+
+			if isinstance(node, DOMElementNode):
+				# Add element with highlight_index
+				if node.highlight_index is not None:
+					next_depth += 1
+
+					text = node.get_all_text_till_next_clickable_element()
+					attributes_html_str = ''
+					if include_attributes:
+						attributes_to_include = {
+							key: str(value) for key, value in node.attributes.items() if key in include_attributes
+						}
+
+						# Easy LLM optimizations
+						# if tag == role attribute, don't include it
+						if node.tag_name == attributes_to_include.get('role'):
+							del attributes_to_include['role']
+
+						# if aria-label == text of the node, don't include it
+						if (
+							attributes_to_include.get('aria-label')
+							and attributes_to_include.get('aria-label', '').strip() == text.strip()
+						):
+							del attributes_to_include['aria-label']
+
+						# if placeholder == text of the node, don't include it
+						if (
+							attributes_to_include.get('placeholder')
+							and attributes_to_include.get('placeholder', '').strip() == text.strip()
+						):
+							del attributes_to_include['placeholder']
+
+						if attributes_to_include:
+							# Format as key1='value1' key2='value2'
+							attributes_html_str = ' '.join(f"{key}='{value}'" for key, value in attributes_to_include.items())
+
+					# Build the line
+					if node.is_new:
+						highlight_indicator = f'*[{node.highlight_index}]*'
+					else:
+						highlight_indicator = f'[{node.highlight_index}]'
+
+					line = f'{depth_str}{highlight_indicator}<{node.tag_name}'
+
+					if attributes_html_str:
+						line += f' {attributes_html_str}'
+
+					if text:
+						# Add space before >text only if there were NO attributes added before
+						if not attributes_html_str:
+							line += ' '
+						line += f'>{text}'
+					# Add space before /> only if neither attributes NOR text were added
+					elif not attributes_html_str:
+						line += ' '
+
+					line += ' />'  # 1 token
+					formatted_text.append(line)
+
+				# Process children regardless
+				for child in node.children:
+					process_node(child, next_depth)
+
+			elif isinstance(node, DOMTextNode):
+				# Add text only if it doesn't have a highlighted parent
+				if (
+					not node.has_parent_with_highlight_index()
+					and node.parent
+					and node.parent.is_visible
+					and node.parent.is_top_element
+				):  # and node.is_parent_top_element()
+					formatted_text.append(f'{depth_str}{node.text}')
+
+		process_node(self, 0)
+		return '\n'.join(formatted_text)
+
+	def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
+		# Check if current element is a file input
+		if self.tag_name == 'input' and self.attributes.get('type') == 'file':
+			return self
+
+		# Check children
+		for child in self.children:
+			if isinstance(child, DOMElementNode):
+				result = child.get_file_upload_element(check_siblings=False)
+				if result:
+					return result
+
+		# Check siblings only for the initial call
+		if check_siblings and self.parent:
+			for sibling in self.parent.children:
+				if sibling is not self and isinstance(sibling, DOMElementNode):
+					result = sibling.get_file_upload_element(check_siblings=False)
+					if result:
+						return result
+
+		return None
+
+
+SelectorMap = dict[int, DOMElementNode]
+
+
+@dataclass
+class DOMState:
+	element_tree: DOMElementNode
+	selector_map: SelectorMap