""" Browser Agent - High-level automation using Playwright For agentic browser control with screenshot + action loop """ import asyncio import base64 import os from typing import Optional, List, Dict, Any from playwright.async_api import async_playwright, Page, Browser class BrowserAgent: """AI-friendly browser automation agent""" def __init__(self): self.browser: Optional[Browser] = None self.page: Optional[Page] = None self.playwright = None async def start(self, headless: bool = False): """Launch browser""" self.playwright = await async_playwright().start() self.browser = await self.playwright.chromium.launch( headless=headless, args=[ '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu', ] ) self.page = await self.browser.new_page(viewport={'width': 1280, 'height': 720}) return self async def stop(self): """Close browser""" if self.browser: await self.browser.close() if self.playwright: await self.playwright.stop() # Navigation async def goto(self, url: str) -> Dict[str, Any]: """Navigate to URL""" response = await self.page.goto(url, wait_until='networkidle') return { 'url': self.page.url, 'status': response.status if response else None, 'title': await self.page.title() } async def back(self): """Go back""" await self.page.go_back() return {'url': self.page.url} async def forward(self): """Go forward""" await self.page.go_forward() return {'url': self.page.url} async def reload(self): """Reload page""" await self.page.reload() return {'url': self.page.url} # Observation async def screenshot(self, full_page: bool = False) -> str: """Take screenshot, return base64""" data = await self.page.screenshot(full_page=full_page) return base64.b64encode(data).decode() async def get_page_info(self) -> Dict[str, Any]: """Get current page information""" return { 'url': self.page.url, 'title': await self.page.title(), 'viewport': self.page.viewport_size } async def get_text(self) -> str: """Get visible text content""" return await self.page.inner_text('body') async def get_html(self) -> str: """Get page HTML""" return await self.page.content() async def get_elements(self, selector: str) -> List[Dict]: """Get element info for selector""" elements = await self.page.query_selector_all(selector) result = [] for el in elements: box = await el.bounding_box() text = await el.inner_text() if await el.is_visible() else "" result.append({ 'text': text[:100], 'visible': await el.is_visible(), 'box': box }) return result # Actions async def click(self, selector: str) -> Dict[str, Any]: """Click element""" await self.page.click(selector) return {'clicked': selector, 'url': self.page.url} async def click_at(self, x: int, y: int) -> Dict[str, Any]: """Click at coordinates""" await self.page.mouse.click(x, y) return {'clicked_at': {'x': x, 'y': y}} async def type(self, selector: str, text: str, clear: bool = True) -> Dict[str, Any]: """Type into element""" if clear: await self.page.fill(selector, text) else: await self.page.type(selector, text) return {'typed': text, 'into': selector} async def press(self, key: str) -> Dict[str, Any]: """Press keyboard key""" await self.page.keyboard.press(key) return {'pressed': key} async def scroll(self, direction: str = 'down', amount: int = 500) -> Dict[str, Any]: """Scroll page""" delta = amount if direction == 'down' else -amount await self.page.mouse.wheel(0, delta) return {'scrolled': direction, 'amount': amount} async def hover(self, selector: str) -> Dict[str, Any]: """Hover over element""" await self.page.hover(selector) return {'hovered': selector} async def select(self, selector: str, value: str) -> Dict[str, Any]: """Select dropdown option""" await self.page.select_option(selector, value) return {'selected': value, 'in': selector} async def wait(self, selector: str, timeout: int = 10000) -> Dict[str, Any]: """Wait for element""" await self.page.wait_for_selector(selector, timeout=timeout) return {'found': selector} async def wait_for_navigation(self, timeout: int = 30000): """Wait for navigation to complete""" await self.page.wait_for_load_state('networkidle', timeout=timeout) return {'url': self.page.url} # JavaScript execution async def evaluate(self, js: str) -> Any: """Execute JavaScript""" return await self.page.evaluate(js) # File operations async def download(self, selector: str, save_path: str) -> Dict[str, Any]: """Click download link and save file""" async with self.page.expect_download() as download_info: await self.page.click(selector) download = await download_info.value await download.save_as(save_path) return {'saved': save_path} async def upload(self, selector: str, file_path: str) -> Dict[str, Any]: """Upload file""" await self.page.set_input_files(selector, file_path) return {'uploaded': file_path} # CLI for testing async def main(): agent = BrowserAgent() await agent.start(headless=False) print("Browser Agent started. Commands:") print(" goto - Navigate to URL") print(" click - Click element") print(" type - Type text") print(" screenshot - Take screenshot") print(" info - Page info") print(" quit - Exit") while True: try: cmd = input("\n> ").strip().split(maxsplit=2) if not cmd: continue action = cmd[0].lower() if action == 'quit': break elif action == 'goto' and len(cmd) > 1: result = await agent.goto(cmd[1]) print(result) elif action == 'click' and len(cmd) > 1: result = await agent.click(cmd[1]) print(result) elif action == 'type' and len(cmd) > 2: result = await agent.type(cmd[1], cmd[2]) print(result) elif action == 'screenshot': data = await agent.screenshot() with open('/tmp/screenshot.png', 'wb') as f: f.write(base64.b64decode(data)) print("Saved to /tmp/screenshot.png") elif action == 'info': print(await agent.get_page_info()) else: print(f"Unknown command: {action}") except Exception as e: print(f"Error: {e}") await agent.stop() if __name__ == "__main__": asyncio.run(main())