megharudushi committed on
Commit
aa56ff3
·
verified ·
1 Parent(s): 4b25450

Upload browser_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. browser_agent.py +221 -0
browser_agent.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Browser Agent - High-level automation using Playwright
3
+ For agentic browser control with screenshot + action loop
4
+ """
5
+
6
+ import asyncio
7
+ import base64
8
+ import os
9
+ from typing import Optional, List, Dict, Any
10
+ from playwright.async_api import async_playwright, Page, Browser
11
+
12
class BrowserAgent:
    """AI-friendly browser automation agent.

    Owns one Playwright Chromium browser and a single page, and exposes
    small async helpers (navigate, observe, act) that return plain
    dicts/strings so results are easy to serialize for an agent loop.

    Call :meth:`start` before any page method and :meth:`stop` when done,
    or use the instance as an async context manager::

        async with BrowserAgent() as agent:
            await agent.goto("https://example.com")
    """

    # Unit (dx, dy) wheel vectors for each supported scroll direction.
    _SCROLL_VECTORS = {
        'down': (0, 1),
        'up': (0, -1),
        'right': (1, 0),
        'left': (-1, 0),
    }

    def __init__(self):
        self.browser: Optional[Browser] = None
        self.page: Optional[Page] = None
        self.playwright = None

    async def __aenter__(self):
        """Start the browser (with ``start()`` defaults) unless already started."""
        if self.page is None:
            await self.start()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Always release the browser and the Playwright driver on exit."""
        await self.stop()

    def _require_page(self) -> "Page":
        """Return the active page, or raise a clear error if not started.

        Raises:
            RuntimeError: if ``start()`` has not been called (or ``stop()``
                has already run).
        """
        if self.page is None:
            raise RuntimeError("Browser not started; call start() first")
        return self.page

    async def start(self, headless: bool = False):
        """Launch Chromium and open a 1280x720 page.

        Args:
            headless: run the browser without a visible window.

        Returns:
            ``self``, so calls can be chained.
        """
        # A second start() would otherwise orphan the previous browser.
        if self.browser is not None or self.playwright is not None:
            await self.stop()

        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=headless,
                args=[
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ],
            )
            self.page = await self.browser.new_page(
                viewport={'width': 1280, 'height': 720}
            )
        except Exception:
            # Do not leave the Playwright driver running if launch fails.
            await self.stop()
            raise
        return self

    async def stop(self):
        """Close the browser and stop Playwright; safe to call repeatedly."""
        browser, playwright = self.browser, self.playwright
        # Drop the handles first so the agent never keeps references to
        # closed objects, even if closing raises.
        self.browser = None
        self.page = None
        self.playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            # Must run even when browser.close() fails, otherwise the
            # Playwright driver is leaked.
            if playwright:
                await playwright.stop()

    # Navigation
    async def goto(self, url: str) -> Dict[str, Any]:
        """Navigate to ``url`` and wait for the ``networkidle`` load state.

        Returns:
            Dict with the final ``url``, the response ``status`` (``None``
            when the navigation yields no response) and the page ``title``.
        """
        page = self._require_page()
        response = await page.goto(url, wait_until='networkidle')
        return {
            'url': page.url,
            'status': response.status if response else None,
            'title': await page.title(),
        }

    async def back(self):
        """Go back one history entry; return the resulting URL."""
        page = self._require_page()
        await page.go_back()
        return {'url': page.url}

    async def forward(self):
        """Go forward one history entry; return the resulting URL."""
        page = self._require_page()
        await page.go_forward()
        return {'url': page.url}

    async def reload(self):
        """Reload the current page; return the resulting URL."""
        page = self._require_page()
        await page.reload()
        return {'url': page.url}

    # Observation
    async def screenshot(self, full_page: bool = False) -> str:
        """Take a screenshot and return the image bytes base64-encoded.

        Args:
            full_page: forwarded to Playwright's ``page.screenshot``.
        """
        data = await self._require_page().screenshot(full_page=full_page)
        return base64.b64encode(data).decode()

    async def get_page_info(self) -> Dict[str, Any]:
        """Return the current ``url``, ``title`` and ``viewport`` size."""
        page = self._require_page()
        return {
            'url': page.url,
            'title': await page.title(),
            'viewport': page.viewport_size,
        }

    async def get_text(self) -> str:
        """Return the ``inner_text`` of the page ``body``."""
        return await self._require_page().inner_text('body')

    async def get_html(self) -> str:
        """Return the page's HTML content."""
        return await self._require_page().content()

    async def get_elements(self, selector: str) -> List[Dict]:
        """Describe every element matching ``selector``.

        Returns:
            One dict per element with ``text`` (first 100 characters,
            empty for invisible elements), ``visible`` and ``box`` (the
            element's bounding box as reported by Playwright).
        """
        page = self._require_page()
        result = []
        for el in await page.query_selector_all(selector):
            box = await el.bounding_box()
            # Query visibility once so 'visible' and 'text' always agree.
            visible = await el.is_visible()
            text = await el.inner_text() if visible else ""
            result.append({
                'text': text[:100],
                'visible': visible,
                'box': box,
            })
        return result

    # Actions
    async def click(self, selector: str) -> Dict[str, Any]:
        """Click the element matching ``selector``."""
        page = self._require_page()
        await page.click(selector)
        return {'clicked': selector, 'url': page.url}

    async def click_at(self, x: int, y: int) -> Dict[str, Any]:
        """Click at page coordinates ``(x, y)`` with the mouse."""
        await self._require_page().mouse.click(x, y)
        return {'clicked_at': {'x': x, 'y': y}}

    async def type(self, selector: str, text: str, clear: bool = True) -> Dict[str, Any]:
        """Enter ``text`` into the element matching ``selector``.

        Args:
            clear: when true use ``page.fill`` (sets the field's value);
                otherwise use ``page.type`` (sends the text as keystrokes).
        """
        page = self._require_page()
        if clear:
            await page.fill(selector, text)
        else:
            await page.type(selector, text)
        return {'typed': text, 'into': selector}

    async def press(self, key: str) -> Dict[str, Any]:
        """Press a keyboard key (Playwright key name, e.g. ``Enter``)."""
        await self._require_page().keyboard.press(key)
        return {'pressed': key}

    async def scroll(self, direction: str = 'down', amount: int = 500) -> Dict[str, Any]:
        """Scroll with the mouse wheel.

        Args:
            direction: ``'down'``, ``'up'``, ``'left'`` or ``'right'``
                (case-insensitive).
            amount: wheel delta to scroll by.

        Raises:
            ValueError: for an unrecognized direction, instead of silently
                scrolling the wrong way.
        """
        try:
            dx, dy = self._SCROLL_VECTORS[str(direction).lower()]
        except KeyError:
            raise ValueError(
                f"Unknown scroll direction: {direction!r} "
                "(expected 'down', 'up', 'left' or 'right')"
            ) from None
        await self._require_page().mouse.wheel(dx * amount, dy * amount)
        return {'scrolled': direction, 'amount': amount}

    async def hover(self, selector: str) -> Dict[str, Any]:
        """Hover over the element matching ``selector``."""
        await self._require_page().hover(selector)
        return {'hovered': selector}

    async def select(self, selector: str, value: str) -> Dict[str, Any]:
        """Select ``value`` in the dropdown matching ``selector``."""
        await self._require_page().select_option(selector, value)
        return {'selected': value, 'in': selector}

    async def wait(self, selector: str, timeout: int = 10000) -> Dict[str, Any]:
        """Wait for ``selector``; ``timeout`` is passed to Playwright (ms)."""
        await self._require_page().wait_for_selector(selector, timeout=timeout)
        return {'found': selector}

    async def wait_for_navigation(self, timeout: int = 30000):
        """Wait for the ``networkidle`` load state; return the current URL."""
        page = self._require_page()
        await page.wait_for_load_state('networkidle', timeout=timeout)
        return {'url': page.url}

    # JavaScript execution
    async def evaluate(self, js: str) -> Any:
        """Evaluate ``js`` in the page and return its result.

        NOTE(security): this runs arbitrary script in the page context;
        only pass trusted code.
        """
        return await self._require_page().evaluate(js)

    # File operations
    async def download(self, selector: str, save_path: str) -> Dict[str, Any]:
        """Click ``selector``, wait for the download, save it to ``save_path``."""
        page = self._require_page()
        async with page.expect_download() as download_info:
            await page.click(selector)
        # Read the value after the block exits, as the Playwright docs show.
        download = await download_info.value
        await download.save_as(save_path)
        return {'saved': save_path}

    async def upload(self, selector: str, file_path: str) -> Dict[str, Any]:
        """Set ``file_path`` on the file input matching ``selector``."""
        await self._require_page().set_input_files(selector, file_path)
        return {'uploaded': file_path}
170
+
171
+
172
+ # CLI for testing
173
async def main():
    """Run a minimal interactive REPL for manually testing BrowserAgent.

    Reads one command per line from stdin until ``quit`` or end-of-input.
    The browser is always shut down on exit, including on Ctrl-C.
    """
    import tempfile  # local import: only the CLI needs it

    agent = BrowserAgent()
    await agent.start(headless=False)

    print("Browser Agent started. Commands:")
    print(" goto <url> - Navigate to URL")
    print(" click <sel> - Click element")
    print(" type <sel> <t> - Type text")
    print(" screenshot - Take screenshot")
    print(" info - Page info")
    print(" quit - Exit")

    # Use the platform temp directory instead of a hard-coded /tmp.
    screenshot_path = os.path.join(tempfile.gettempdir(), 'screenshot.png')

    try:
        while True:
            try:
                # NOTE: input() blocks the event loop while waiting; that is
                # acceptable here because no command is in flight.
                line = input("\n> ")
            except EOFError:
                # stdin closed: leave the loop instead of spinning forever.
                break

            # Split off only the action so selectors may contain spaces.
            parts = line.strip().split(maxsplit=1)
            if not parts:
                continue

            action = parts[0].lower()
            arg = parts[1] if len(parts) > 1 else ""

            try:
                if action == 'quit':
                    break
                elif action == 'goto':
                    if not arg:
                        print("Usage: goto <url>")
                        continue
                    print(await agent.goto(arg))
                elif action == 'click':
                    if not arg:
                        print("Usage: click <sel>")
                        continue
                    print(await agent.click(arg))
                elif action == 'type':
                    # First token is the selector, the rest is the text.
                    type_args = arg.split(maxsplit=1)
                    if len(type_args) < 2:
                        print("Usage: type <sel> <t>")
                        continue
                    print(await agent.type(type_args[0], type_args[1]))
                elif action == 'screenshot':
                    data = await agent.screenshot()
                    with open(screenshot_path, 'wb') as f:
                        f.write(base64.b64decode(data))
                    print(f"Saved to {screenshot_path}")
                elif action == 'info':
                    print(await agent.get_page_info())
                else:
                    print(f"Unknown command: {action}")
            except Exception as e:
                # Top-level REPL boundary: report and keep the session alive.
                print(f"Error: {e}")
    finally:
        await agent.stop()


# Start the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())