From a2c88daa741df8c6981eeb2ae9e2f3c2e5ce14bd Mon Sep 17 00:00:00 2001 From: "tv0924@icloud.com" Date: Mon, 2 Jun 2025 15:09:43 +0900 Subject: [PATCH] =?UTF-8?q?=EC=97=AC=EB=9F=AC=20url=EC=9D=84=20=EC=97=B0?= =?UTF-8?q?=EC=86=8D=EC=A0=81=EC=9C=BC=EB=A1=9C=20=EC=8B=A4=ED=96=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/is_html.py | 47 ++++++++++++ main.py | 191 ++++++++++++++++++++++++++++--------------------- 2 files changed, 157 insertions(+), 81 deletions(-) create mode 100644 lib/is_html.py diff --git a/lib/is_html.py b/lib/is_html.py new file mode 100644 index 0000000..fe8dcfd --- /dev/null +++ b/lib/is_html.py @@ -0,0 +1,47 @@ +import requests + +def is_html_url(url: str, timeout: float = 10.0) -> bool: + """ + 주어진 URL에 HEAD 요청을 보내고, 응답 헤더의 Content-Type이 HTML인지 확인합니다. + - url: 검사할 URL 문자열 + - timeout: 요청 타임아웃(초 단위) + + 반환값: + - Content-Type이 'text/html' 로 시작하면 True, 그렇지 않으면 False + """ + + try: + # HEAD 요청으로 헤더만 가져와도 충분하지만, 일부 서버에서 HEAD를 허용하지 않을 수 있어 + # GET 요청을 사용해도 무방합니다. 단, GET은 바디를 가져오기 때문에 HEAD보다 비용이 높을 수 있음. + response = requests.head(url, timeout=timeout, allow_redirects=True) + + # 만약 HEAD 요청에 실패하거나 서버가 405(Method Not Allowed)를 반환하면, GET 요청으로 재시도 + if response.status_code == 405: + response = requests.get(url, timeout=timeout, stream=True) + + # 응답 코드가 200번대가 아니면 False로 간주 + if not response.ok: + return False + + content_type = response.headers.get('Content-Type', '') + # Content-Type에 'text/html'이 포함되어 있으면 HTML로 간주 + return content_type.lower().startswith('text/html') + + except requests.RequestException as e: + # 네트워크 오류, 타임아웃 등 예외 발생 시 False 반환 + # 필요하다면 로그를 찍거나 예외를 다시 던질 수 있습니다. + print(f"Error fetching URL: {e}") + return False + +if __name__ == '__main__': + test_urls = [ + 'https://www.example.com', + 'https://api.github.com', # JSON API라서 HTML이 아닐 확률이 높음 + 'https://raw.githubusercontent.com' # 텍스트 파일 등 다양한 타입 + ] + + for url in test_urls: + if is_html_url(url): + print(f"[HTML] {url}") + else: + print(f"[Not HTML] {url}") diff --git a/main.py b/main.py index b6db495..e1c2178 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ import asyncio import json import os +import csv from typing import List from dotenv import load_dotenv from pydantic import BaseModel @@ -8,38 +9,18 @@ from langchain_google_genai import ChatGoogleGenerativeAI from browser_use import Agent, Browser, BrowserConfig, Controller from browser_use.browser.context import BrowserContext, BrowserContextConfig from lib.browser_config import browser_config_kwargs -import csv +from lib.is_html import is_html_url load_dotenv() -# Check environment variables if os.getenv("GOOGLE_API_KEY") is None: - raise ValueError("OPENAI_API_KEY environment variable not set.") + raise ValueError("GOOGLE_API_KEY 환경변수가 설정되지 않았습니다.") if os.getenv("GOOGLE_MODEL") is None: - raise ValueError("OPENAI_MODEL environment variable not set.") + raise ValueError("GOOGLE_MODEL 환경변수가 설정되지 않았습니다.") if os.getenv("GOOGLE_PLANNER_MODEL") is None: - raise ValueError("OPENAI_PLANNER_MODEL environment variable not set.") + raise ValueError("GOOGLE_PLANNER_MODEL 환경변수가 설정되지 않았습니다.") -# Configure browser -browser = Browser( - config=BrowserConfig(**browser_config_kwargs()) -) - -# Set browser context -context = BrowserContext( - browser=browser, - config=BrowserContextConfig( - wait_for_network_idle_page_load_time=3.0, - window_width=1600, - window_height=900, - locale='en-US', - highlight_elements=True, - viewport_expansion=500, - keep_alive=True - ) -) - -# Output model: each result is one OAuth entry with metadata +# 출력 모델 class OAuth(BaseModel): provider: str oauth_uri: str @@ -47,51 +28,77 @@ class OAuth(BaseModel): class OAuthList(BaseModel): oauth_providers: List[OAuth] -controller = Controller(output_model=OAuthList) +# Controller는 매번 새로 생성해도 무방합니다. +def make_controller(): + return Controller(output_model=OAuthList) # Extended planner prompt extend_planner_system_message = """ -🎯 Your mission is to collect the real OAuth login URLs from the website. +🎯 Mission: Collect Initial SSO Redirect URLs (For Browser Automation) -1. First, go to the website’s **login page**. -2. On the login page, look for OAuth login buttons. These usually say things like **"Continue with Google"**, **"Sign in with GitHub"**, etc. -3. ⚠️ **DO NOT collect or include "Passkey"** — it is NOT an OAuth provider. +1. Locate the Login Page +- Navigate to the **client (non-enterprise)** login page. +- If a **privacy policy / cookie / consent popup** appears, **dismiss** or **close** it before continuing. ---- +2. On the Login Page +- Look for buttons like: + - "Continue with Google" + - "Sign in with GitHub" + - "Login with Naver" +- ✅ Only proceed if **you clearly see a real SSO (social login) button**. +- ❌ Ignore or exclude: + - Buttons with "Passkey" + - Username/password fields + - Email-based login + - Login via certificate or mobile verification + - Any non-OAuth login options -✅ For EACH OAuth button you find: +3. If at least one valid SSO login button is found: +- Try to **open it in a new tab**. If that’s not possible, **click it directly**. +- Capture the **first URL that the browser is redirected to** include query string. This URL should: + ✅ Look like: `https://example.com/auth/google` + ❌ Do NOT collect OAuth provider endpoint like: `https://accounts.google.com/...` +- Return the results in the following format: + [ + { + "provider": "Google", + "oauth_uri": "https://example.com/auth/google?include_all_params=..." + }, + { + "provider": "GitHub", + "oauth_uri": "https://example.com/auth/github?include_all_params=..." + } + ] -- **Try opening it in a new tab**. If it redirects to an OAuth URL (e.g. `https://accounts.google.com/...`, `https://github.com/login/oauth/...`), copy that **exact final URL**. -- If it **doesn’t open in a new tab**, **click the button** and wait for the redirect to happen. - - As soon as you see the redirected URL with **client_id**, **redirect_uri**, etc., copy that **entire URL without changing or hiding anything**. - - Then come back to the original tab (if needed) and continue with the next provider. - ---- - -💡 **Do not guess** the OAuth URLs — only collect them by actually interacting with the buttons. - -🚫 **Do not redact or mask any part** of the URL, including `client_id`, `redirect_uri`, `state`, or any other parameters. Record them exactly as they appear. - -✅ Return a list of all OAuth providers and their **full raw redirect URLs** in this exact format: - -```json -[ - { - "provider": "Google", - "oauth_uri": "https://accounts.google.com/o/oauth2/v2/auth?client_id=...&redirect_uri=...&...", - }, - { - "provider": "GitHub", - "oauth_uri": "https://github.com/login/oauth/authorize?client_id=...&redirect_uri=...", - } -] -``` +4. If No SSO Login Buttons Are Found or an Error Occurs: +- ❌ Terminate the process immediately. +- Return an empty list: `[]` """ -# Main async runner -async def main(): - url = "https://git.imnya.ng" +# ── URL별로 Browser를 새로 띄우는 함수 ── +async def scan_one_url(url: str): + # 1) URL이 HTML 페이지인지 확인 + if not is_html_url(url): + print(f"❌ {url} 은(는) HTML이 아닙니다. 스킵합니다.") + return + # 2) Browser + Context 생성 + browser = Browser(config=BrowserConfig(**browser_config_kwargs())) + context = BrowserContext( + browser=browser, + config=BrowserContextConfig( + wait_for_network_idle_page_load_time=3.0, + window_width=1600, + window_height=900, + locale='en-US', + highlight_elements=True, + viewport_expansion=500, + keep_alive=False + ) + ) + + # 3) Agent, Controller 생성 + controller = make_controller() agent = Agent( browser_context=context, browser=browser, @@ -102,26 +109,20 @@ async def main(): extend_planner_system_message=extend_planner_system_message, ) - # Run the agent + # 4) 실제 스캔 실행 response = await agent.run() final_result = response.final_result() if final_result is None: - raise ValueError("final_result() returned None") + raise ValueError("final_result()가 None을 반환했습니다.") data = json.loads(final_result) - try: oauth_entries: List[OAuth] = [OAuth(**entry) for entry in data["oauth_providers"]] except Exception as e: - raise ValueError(f"Failed to parse result: {e}\nRaw result: {final_result}") - - - # Clear terminal - #print("\033c", end="") - print("-" * 20) - - print(f"Raw result: {final_result}") + raise ValueError(f"결과 파싱 실패: {e}\n원본 결과: {final_result}") + # 5) 결과 출력 + print("-" * 50) print(f"🔗 Scanned URL: {url}\n") print("🔐 Detected OAuth Providers and URLs:") for entry in oauth_entries: @@ -129,10 +130,10 @@ async def main(): print(f"⚠️ WARNING: {entry.provider} URL may be masked or incomplete:\n{entry.oauth_uri}\n") else: print(f"- {entry.provider}: {entry.oauth_uri}") + print("-" * 50) - # Save the result to CSV (append mode, so you can continue later) - # 이거 좀 이상한데 나중에 고쳐야 할듯 파일이 수정이 안됨 - csv_file = "oauth_providers.csv" + # 6) CSV에 저장 (append) + csv_file = "./oauth_providers.csv" file_exists = os.path.isfile(csv_file) with open(csv_file, "a", newline="", encoding="utf-8") as f: writer = csv.writer(f) @@ -140,13 +141,41 @@ async def main(): writer.writerow(["issuer", "provider", "oauth_uri"]) for entry in oauth_entries: writer.writerow([url, entry.provider, entry.oauth_uri]) - print(f"\n✅ OAuth providers saved to {csv_file}") + print(f"✅ OAuth providers saved to {csv_file}\n") - # Save the result to JSON - with open(f"oauth_providers_{url}.json", "w") as f: - json.dump(data, f, indent=2) - print(f"✅ OAuth providers saved to oauth_providers_{url}.json") + # 7) Agent와 Browser 닫기 + await agent.close() # Agent 내부 작업 정리 + await context.close() # 브라우저 컨텍스트 종료 (탭/세션 닫기) + await browser.close() # 실제 브라우저 프로세스 종료 +# ── 인터랙티브 입력 루프 ── +async def loop(): + + target_list = [ +# "chefsdinners.com", +# "dungeonofdoomkemah.com", +# "fertittaentertainmentinc.com", +# "galvestonholidayinn.com", +# "goldennugget.com", +# "hunttinginn.com", +# "kemahbeerfest.com", +# "lilliesasiancuisine.com", +# "muer.com", +# "pleasurepier.com", +# "r-u-i.com", +# "sanluisresort.com", +"shoppostoak.com", +"thepostoak.com", +"thepostoakhotel.com", +"tilmanfertitta.com", +"wildwoodcasino.net", +"accounts.firefox.com", +"addons.allizom.org", +"api.profiler.firefox.com"] -# Run it -asyncio.run(main()) + for url in target_list: + await scan_one_url(f'https://{url}') + +# ── 진입점 ── +if __name__ == "__main__": + asyncio.run(loop())