mirror of
https://github.com/j93es/browser-use-oauth.git
synced 2026-06-04 08:11:53 +09:00
여러 url을 연속적으로 실행
This commit is contained in:
parent
ba02e8ea21
commit
a2c88daa74
2 changed files with 164 additions and 88 deletions
47
lib/is_html.py
Normal file
47
lib/is_html.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def is_html_url(url: str, timeout: float = 10.0) -> bool:
    """Return True if *url* answers successfully with an HTML Content-Type.

    A HEAD request (following redirects) is sent first so that only headers
    are transferred. If the server rejects the HEAD method (405 Method Not
    Allowed or 501 Not Implemented), the check is retried with a streamed GET
    so the body is still not downloaded.

    Args:
        url: URL to probe.
        timeout: Timeout in seconds passed to each request.

    Returns:
        True when the final response is not an error (``response.ok``) and its
        Content-Type header starts with ``text/html`` (case-insensitive).
        False otherwise, including when any ``requests.RequestException``
        (network error, timeout, invalid URL, ...) is raised.
    """
    try:
        response = requests.head(url, timeout=timeout, allow_redirects=True)

        # Some servers do not implement HEAD; fall back to GET for those.
        if response.status_code in (405, 501):
            response.close()
            # stream=True defers the body download; only the headers are needed.
            response = requests.get(url, timeout=timeout, stream=True)

        try:
            # Error responses are treated as "not HTML".
            if not response.ok:
                return False

            content_type = response.headers.get('Content-Type', '')
            # Prefix match so parameters such as "; charset=utf-8" are accepted.
            return content_type.strip().lower().startswith('text/html')
        finally:
            # A streamed response keeps its connection open until it is closed.
            response.close()

    except requests.RequestException as e:
        # Best-effort probe: report the failure and treat the URL as non-HTML.
        print(f"Error fetching URL: {e}")
        return False
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke check: probe a few well-known hosts and print the verdict.
    sample_urls = (
        'https://www.example.com',
        'https://api.github.com',  # JSON API, so most likely not HTML
        'https://raw.githubusercontent.com',  # serves text files and other types
    )

    for candidate in sample_urls:
        verdict = (
            f"[HTML] {candidate}"
            if is_html_url(candidate)
            else f"[Not HTML] {candidate}"
        )
        print(verdict)
|
||||||
191
main.py
191
main.py
|
|
@ -1,6 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import csv
|
||||||
from typing import List
|
from typing import List
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
@ -8,38 +9,18 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
||||||
from browser_use import Agent, Browser, BrowserConfig, Controller
|
from browser_use import Agent, Browser, BrowserConfig, Controller
|
||||||
from browser_use.browser.context import BrowserContext, BrowserContextConfig
|
from browser_use.browser.context import BrowserContext, BrowserContextConfig
|
||||||
from lib.browser_config import browser_config_kwargs
|
from lib.browser_config import browser_config_kwargs
|
||||||
import csv
|
from lib.is_html import is_html_url
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
# Check environment variables
|
|
||||||
if os.getenv("GOOGLE_API_KEY") is None:
|
if os.getenv("GOOGLE_API_KEY") is None:
|
||||||
raise ValueError("OPENAI_API_KEY environment variable not set.")
|
raise ValueError("GOOGLE_API_KEY 환경변수가 설정되지 않았습니다.")
|
||||||
if os.getenv("GOOGLE_MODEL") is None:
|
if os.getenv("GOOGLE_MODEL") is None:
|
||||||
raise ValueError("OPENAI_MODEL environment variable not set.")
|
raise ValueError("GOOGLE_MODEL 환경변수가 설정되지 않았습니다.")
|
||||||
if os.getenv("GOOGLE_PLANNER_MODEL") is None:
|
if os.getenv("GOOGLE_PLANNER_MODEL") is None:
|
||||||
raise ValueError("OPENAI_PLANNER_MODEL environment variable not set.")
|
raise ValueError("GOOGLE_PLANNER_MODEL 환경변수가 설정되지 않았습니다.")
|
||||||
|
|
||||||
# Configure browser
|
# 출력 모델
|
||||||
browser = Browser(
|
|
||||||
config=BrowserConfig(**browser_config_kwargs())
|
|
||||||
)
|
|
||||||
|
|
||||||
# Set browser context
|
|
||||||
context = BrowserContext(
|
|
||||||
browser=browser,
|
|
||||||
config=BrowserContextConfig(
|
|
||||||
wait_for_network_idle_page_load_time=3.0,
|
|
||||||
window_width=1600,
|
|
||||||
window_height=900,
|
|
||||||
locale='en-US',
|
|
||||||
highlight_elements=True,
|
|
||||||
viewport_expansion=500,
|
|
||||||
keep_alive=True
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Output model: each result is one OAuth entry with metadata
|
|
||||||
class OAuth(BaseModel):
|
class OAuth(BaseModel):
|
||||||
provider: str
|
provider: str
|
||||||
oauth_uri: str
|
oauth_uri: str
|
||||||
|
|
@ -47,51 +28,77 @@ class OAuth(BaseModel):
|
||||||
class OAuthList(BaseModel):
|
class OAuthList(BaseModel):
|
||||||
oauth_providers: List[OAuth]
|
oauth_providers: List[OAuth]
|
||||||
|
|
||||||
controller = Controller(output_model=OAuthList)
|
# Controller는 매번 새로 생성해도 무방합니다.
|
||||||
|
def make_controller() -> Controller:
    """Return a newly constructed Controller whose output model is OAuthList."""
    return Controller(output_model=OAuthList)
|
||||||
|
|
||||||
# Extended planner prompt
|
# Extended planner prompt
|
||||||
extend_planner_system_message = """
|
extend_planner_system_message = """
|
||||||
🎯 Your mission is to collect the real OAuth login URLs from the website.
|
🎯 Mission: Collect Initial SSO Redirect URLs (For Browser Automation)
|
||||||
|
|
||||||
1. First, go to the website’s **login page**.
|
1. Locate the Login Page
|
||||||
2. On the login page, look for OAuth login buttons. These usually say things like **"Continue with Google"**, **"Sign in with GitHub"**, etc.
|
- Navigate to the **client (non-enterprise)** login page.
|
||||||
3. ⚠️ **DO NOT collect or include "Passkey"** — it is NOT an OAuth provider.
|
- If a **privacy policy / cookie / consent popup** appears, **dismiss** or **close** it before continuing.
|
||||||
|
|
||||||
---
|
2. On the Login Page
|
||||||
|
- Look for buttons like:
|
||||||
|
- "Continue with Google"
|
||||||
|
- "Sign in with GitHub"
|
||||||
|
- "Login with Naver"
|
||||||
|
- ✅ Only proceed if **you clearly see a real SSO (social login) button**.
|
||||||
|
- ❌ Ignore or exclude:
|
||||||
|
- Buttons with "Passkey"
|
||||||
|
- Username/password fields
|
||||||
|
- Email-based login
|
||||||
|
- Login via certificate or mobile verification
|
||||||
|
- Any non-OAuth login options
|
||||||
|
|
||||||
✅ For EACH OAuth button you find:
|
3. If at least one valid SSO login button is found:
|
||||||
|
- Try to **open it in a new tab**. If that’s not possible, **click it directly**.
|
||||||
|
- Capture the **first URL that the browser is redirected to** include query string. This URL should:
|
||||||
|
✅ Look like: `https://example.com/auth/google`
|
||||||
|
❌ Do NOT collect OAuth provider endpoint like: `https://accounts.google.com/...`
|
||||||
|
- Return the results in the following format:
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"provider": "Google",
|
||||||
|
"oauth_uri": "https://example.com/auth/google?include_all_params=..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"provider": "GitHub",
|
||||||
|
"oauth_uri": "https://example.com/auth/github?include_all_params=..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
- **Try opening it in a new tab**. If it redirects to an OAuth URL (e.g. `https://accounts.google.com/...`, `https://github.com/login/oauth/...`), copy that **exact final URL**.
|
4. If No SSO Login Buttons Are Found or an Error Occurs:
|
||||||
- If it **doesn’t open in a new tab**, **click the button** and wait for the redirect to happen.
|
- ❌ Terminate the process immediately.
|
||||||
- As soon as you see the redirected URL with **client_id**, **redirect_uri**, etc., copy that **entire URL without changing or hiding anything**.
|
- Return an empty list: `[]`
|
||||||
- Then come back to the original tab (if needed) and continue with the next provider.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
💡 **Do not guess** the OAuth URLs — only collect them by actually interacting with the buttons.
|
|
||||||
|
|
||||||
🚫 **Do not redact or mask any part** of the URL, including `client_id`, `redirect_uri`, `state`, or any other parameters. Record them exactly as they appear.
|
|
||||||
|
|
||||||
✅ Return a list of all OAuth providers and their **full raw redirect URLs** in this exact format:
|
|
||||||
|
|
||||||
```json
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"provider": "Google",
|
|
||||||
"oauth_uri": "https://accounts.google.com/o/oauth2/v2/auth?client_id=...&redirect_uri=...&...",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"provider": "GitHub",
|
|
||||||
"oauth_uri": "https://github.com/login/oauth/authorize?client_id=...&redirect_uri=...",
|
|
||||||
}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Main async runner
|
# ── URL별로 Browser를 새로 띄우는 함수 ──
|
||||||
async def main():
|
async def scan_one_url(url: str):
|
||||||
url = "https://git.imnya.ng"
|
# 1) URL이 HTML 페이지인지 확인
|
||||||
|
if not is_html_url(url):
|
||||||
|
print(f"❌ {url} 은(는) HTML이 아닙니다. 스킵합니다.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 2) Browser + Context 생성
|
||||||
|
browser = Browser(config=BrowserConfig(**browser_config_kwargs()))
|
||||||
|
context = BrowserContext(
|
||||||
|
browser=browser,
|
||||||
|
config=BrowserContextConfig(
|
||||||
|
wait_for_network_idle_page_load_time=3.0,
|
||||||
|
window_width=1600,
|
||||||
|
window_height=900,
|
||||||
|
locale='en-US',
|
||||||
|
highlight_elements=True,
|
||||||
|
viewport_expansion=500,
|
||||||
|
keep_alive=False
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3) Agent, Controller 생성
|
||||||
|
controller = make_controller()
|
||||||
agent = Agent(
|
agent = Agent(
|
||||||
browser_context=context,
|
browser_context=context,
|
||||||
browser=browser,
|
browser=browser,
|
||||||
|
|
@ -102,26 +109,20 @@ async def main():
|
||||||
extend_planner_system_message=extend_planner_system_message,
|
extend_planner_system_message=extend_planner_system_message,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run the agent
|
# 4) 실제 스캔 실행
|
||||||
response = await agent.run()
|
response = await agent.run()
|
||||||
final_result = response.final_result()
|
final_result = response.final_result()
|
||||||
if final_result is None:
|
if final_result is None:
|
||||||
raise ValueError("final_result() returned None")
|
raise ValueError("final_result()가 None을 반환했습니다.")
|
||||||
|
|
||||||
data = json.loads(final_result)
|
data = json.loads(final_result)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
oauth_entries: List[OAuth] = [OAuth(**entry) for entry in data["oauth_providers"]]
|
oauth_entries: List[OAuth] = [OAuth(**entry) for entry in data["oauth_providers"]]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Failed to parse result: {e}\nRaw result: {final_result}")
|
raise ValueError(f"결과 파싱 실패: {e}\n원본 결과: {final_result}")
|
||||||
|
|
||||||
|
|
||||||
# Clear terminal
|
|
||||||
#print("\033c", end="")
|
|
||||||
print("-" * 20)
|
|
||||||
|
|
||||||
print(f"Raw result: {final_result}")
|
|
||||||
|
|
||||||
|
# 5) 결과 출력
|
||||||
|
print("-" * 50)
|
||||||
print(f"🔗 Scanned URL: {url}\n")
|
print(f"🔗 Scanned URL: {url}\n")
|
||||||
print("🔐 Detected OAuth Providers and URLs:")
|
print("🔐 Detected OAuth Providers and URLs:")
|
||||||
for entry in oauth_entries:
|
for entry in oauth_entries:
|
||||||
|
|
@ -129,10 +130,10 @@ async def main():
|
||||||
print(f"⚠️ WARNING: {entry.provider} URL may be masked or incomplete:\n{entry.oauth_uri}\n")
|
print(f"⚠️ WARNING: {entry.provider} URL may be masked or incomplete:\n{entry.oauth_uri}\n")
|
||||||
else:
|
else:
|
||||||
print(f"- {entry.provider}: {entry.oauth_uri}")
|
print(f"- {entry.provider}: {entry.oauth_uri}")
|
||||||
|
print("-" * 50)
|
||||||
|
|
||||||
# Save the result to CSV (append mode, so you can continue later)
|
# 6) CSV에 저장 (append)
|
||||||
# 이거 좀 이상한데 나중에 고쳐야 할듯 파일이 수정이 안됨
|
csv_file = "./oauth_providers.csv"
|
||||||
csv_file = "oauth_providers.csv"
|
|
||||||
file_exists = os.path.isfile(csv_file)
|
file_exists = os.path.isfile(csv_file)
|
||||||
with open(csv_file, "a", newline="", encoding="utf-8") as f:
|
with open(csv_file, "a", newline="", encoding="utf-8") as f:
|
||||||
writer = csv.writer(f)
|
writer = csv.writer(f)
|
||||||
|
|
@ -140,13 +141,41 @@ async def main():
|
||||||
writer.writerow(["issuer", "provider", "oauth_uri"])
|
writer.writerow(["issuer", "provider", "oauth_uri"])
|
||||||
for entry in oauth_entries:
|
for entry in oauth_entries:
|
||||||
writer.writerow([url, entry.provider, entry.oauth_uri])
|
writer.writerow([url, entry.provider, entry.oauth_uri])
|
||||||
print(f"\n✅ OAuth providers saved to {csv_file}")
|
print(f"✅ OAuth providers saved to {csv_file}\n")
|
||||||
|
|
||||||
# Save the result to JSON
|
# 7) Agent와 Browser 닫기
|
||||||
with open(f"oauth_providers_{url}.json", "w") as f:
|
await agent.close() # Agent 내부 작업 정리
|
||||||
json.dump(data, f, indent=2)
|
await context.close() # 브라우저 컨텍스트 종료 (탭/세션 닫기)
|
||||||
print(f"✅ OAuth providers saved to oauth_providers_{url}.json")
|
await browser.close() # 실제 브라우저 프로세스 종료
|
||||||
|
|
||||||
|
# ── 인터랙티브 입력 루프 ──
|
||||||
|
async def loop():
    """Scan every active host in the built-in target list, one after another.

    Each host is prefixed with ``https://`` and passed to ``scan_one_url``.
    A failure while scanning one host is reported and the loop moves on to
    the next host, so a single bad target does not abort the whole batch.
    """
    # Commented-out hosts are not scanned.
    target_list = [
        # "chefsdinners.com",
        # "dungeonofdoomkemah.com",
        # "fertittaentertainmentinc.com",
        # "galvestonholidayinn.com",
        # "goldennugget.com",
        # "hunttinginn.com",
        # "kemahbeerfest.com",
        # "lilliesasiancuisine.com",
        # "muer.com",
        # "pleasurepier.com",
        # "r-u-i.com",
        # "sanluisresort.com",
        "shoppostoak.com",
        "thepostoak.com",
        "thepostoakhotel.com",
        "tilmanfertitta.com",
        "wildwoodcasino.net",
        "accounts.firefox.com",
        "addons.allizom.org",
        "api.profiler.firefox.com",
    ]

    for url in target_list:
        try:
            await scan_one_url(f'https://{url}')
        except Exception as e:
            # Per-target boundary: report and continue with the remaining hosts.
            # NOTE(review): scan_one_url appears to close its browser only at
            # the end of a successful run; confirm failed scans do not leave a
            # browser process behind.
            print(f"❌ Failed to scan {url}: {e}")
|
||||||
|
|
||||||
|
# ── 진입점 ──
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(loop())
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue