Spaces:

megharudushi
/

vnc-browser-agent

Paused

App Files Files Community

megharudushi committed 11 days ago

Commit

aa56ff3

verified ·

1 Parent(s): 4b25450

Upload browser_agent.py with huggingface_hub

Browse files

Files changed (1) hide show

browser_agent.py +221 -0

browser_agent.py ADDED Viewed

	@@ -0,0 +1,221 @@

+"""
+Browser Agent - High-level automation using Playwright
+For agentic browser control with screenshot + action loop
+"""
+import asyncio
+import base64
+import os
+from typing import Optional, List, Dict, Any
+from playwright.async_api import async_playwright, Page, Browser
+class BrowserAgent:
+    """AI-friendly browser automation agent"""
+    def __init__(self):
+        self.browser: Optional[Browser] = None
+        self.page: Optional[Page] = None
+        self.playwright = None
+    async def start(self, headless: bool = False):
+        """Launch browser"""
+        self.playwright = await async_playwright().start()
+        self.browser = await self.playwright.chromium.launch(
+            headless=headless,
+            args=[
+                '--no-sandbox',
+                '--disable-dev-shm-usage',
+                '--disable-gpu',
+            ]
+        )
+        self.page = await self.browser.new_page(viewport={'width': 1280, 'height': 720})
+        return self
+    async def stop(self):
+        """Close browser"""
+        if self.browser:
+            await self.browser.close()
+        if self.playwright:
+            await self.playwright.stop()
+    # Navigation
+    async def goto(self, url: str) -> Dict[str, Any]:
+        """Navigate to URL"""
+        response = await self.page.goto(url, wait_until='networkidle')
+        return {
+            'url': self.page.url,
+            'status': response.status if response else None,
+            'title': await self.page.title()
+        }
+    async def back(self):
+        """Go back"""
+        await self.page.go_back()
+        return {'url': self.page.url}
+    async def forward(self):
+        """Go forward"""
+        await self.page.go_forward()
+        return {'url': self.page.url}
+    async def reload(self):
+        """Reload page"""
+        await self.page.reload()
+        return {'url': self.page.url}
+    # Observation
+    async def screenshot(self, full_page: bool = False) -> str:
+        """Take screenshot, return base64"""
+        data = await self.page.screenshot(full_page=full_page)
+        return base64.b64encode(data).decode()
+    async def get_page_info(self) -> Dict[str, Any]:
+        """Get current page information"""
+        return {
+            'url': self.page.url,
+            'title': await self.page.title(),
+            'viewport': self.page.viewport_size
+        }
+    async def get_text(self) -> str:
+        """Get visible text content"""
+        return await self.page.inner_text('body')
+    async def get_html(self) -> str:
+        """Get page HTML"""
+        return await self.page.content()
+    async def get_elements(self, selector: str) -> List[Dict]:
+        """Get element info for selector"""
+        elements = await self.page.query_selector_all(selector)
+        result = []
+        for el in elements:
+            box = await el.bounding_box()
+            text = await el.inner_text() if await el.is_visible() else ""
+            result.append({
+                'text': text[:100],
+                'visible': await el.is_visible(),
+                'box': box
+            })
+        return result
+    # Actions
+    async def click(self, selector: str) -> Dict[str, Any]:
+        """Click element"""
+        await self.page.click(selector)
+        return {'clicked': selector, 'url': self.page.url}
+    async def click_at(self, x: int, y: int) -> Dict[str, Any]:
+        """Click at coordinates"""
+        await self.page.mouse.click(x, y)
+        return {'clicked_at': {'x': x, 'y': y}}
+    async def type(self, selector: str, text: str, clear: bool = True) -> Dict[str, Any]:
+        """Type into element"""
+        if clear:
+            await self.page.fill(selector, text)
+        else:
+            await self.page.type(selector, text)
+        return {'typed': text, 'into': selector}
+    async def press(self, key: str) -> Dict[str, Any]:
+        """Press keyboard key"""
+        await self.page.keyboard.press(key)
+        return {'pressed': key}
+    async def scroll(self, direction: str = 'down', amount: int = 500) -> Dict[str, Any]:
+        """Scroll page"""
+        delta = amount if direction == 'down' else -amount
+        await self.page.mouse.wheel(0, delta)
+        return {'scrolled': direction, 'amount': amount}
+    async def hover(self, selector: str) -> Dict[str, Any]:
+        """Hover over element"""
+        await self.page.hover(selector)
+        return {'hovered': selector}
+    async def select(self, selector: str, value: str) -> Dict[str, Any]:
+        """Select dropdown option"""
+        await self.page.select_option(selector, value)
+        return {'selected': value, 'in': selector}
+    async def wait(self, selector: str, timeout: int = 10000) -> Dict[str, Any]:
+        """Wait for element"""
+        await self.page.wait_for_selector(selector, timeout=timeout)
+        return {'found': selector}
+    async def wait_for_navigation(self, timeout: int = 30000):
+        """Wait for navigation to complete"""
+        await self.page.wait_for_load_state('networkidle', timeout=timeout)
+        return {'url': self.page.url}
+    # JavaScript execution
+    async def evaluate(self, js: str) -> Any:
+        """Execute JavaScript"""
+        return await self.page.evaluate(js)
+    # File operations
+    async def download(self, selector: str, save_path: str) -> Dict[str, Any]:
+        """Click download link and save file"""
+        async with self.page.expect_download() as download_info:
+            await self.page.click(selector)
+        download = await download_info.value
+        await download.save_as(save_path)
+        return {'saved': save_path}
+    async def upload(self, selector: str, file_path: str) -> Dict[str, Any]:
+        """Upload file"""
+        await self.page.set_input_files(selector, file_path)
+        return {'uploaded': file_path}
+# CLI for testing
+async def main():
+    agent = BrowserAgent()
+    await agent.start(headless=False)
+    print("Browser Agent started. Commands:")
+    print("  goto <url>     - Navigate to URL")
+    print("  click <sel>    - Click element")
+    print("  type <sel> <t> - Type text")
+    print("  screenshot     - Take screenshot")
+    print("  info           - Page info")
+    print("  quit           - Exit")
+    while True:
+        try:
+            cmd = input("\n> ").strip().split(maxsplit=2)
+            if not cmd:
+                continue
+            action = cmd[0].lower()
+            if action == 'quit':
+                break
+            elif action == 'goto' and len(cmd) > 1:
+                result = await agent.goto(cmd[1])
+                print(result)
+            elif action == 'click' and len(cmd) > 1:
+                result = await agent.click(cmd[1])
+                print(result)
+            elif action == 'type' and len(cmd) > 2:
+                result = await agent.type(cmd[1], cmd[2])
+                print(result)
+            elif action == 'screenshot':
+                data = await agent.screenshot()
+                with open('/tmp/screenshot.png', 'wb') as f:
+                    f.write(base64.b64decode(data))
+                print("Saved to /tmp/screenshot.png")
+            elif action == 'info':
+                print(await agent.get_page_info())
+            else:
+                print(f"Unknown command: {action}")
+        except Exception as e:
+            print(f"Error: {e}")
+    await agent.stop()
+# Script entry point: run the interactive test CLI via asyncio.run() when
+# this file is executed directly rather than imported.
+if __name__ == "__main__":
+    asyncio.run(main())