Merge pull request #9 from j93es/j93es2

에러 발생 관련 로직 개선
2026-07-29 10:43:04 +09:00 · 2025-06-09 13:10:50 +09:00 · 2025-06-09 13:10:50 +09:00 · 09a91f9c7c
commit 09a91f9c7c
parent 1b0a0deeb4 94ca383b82
6 changed files with 280 additions and 34535 deletions
--- a/.gitignore
+++ b/.gitignore
@ -15,3 +15,72 @@ oauth_providers.csv
 log_*.log

 domains.txt
+
+# Created by https://www.toptal.com/developers/gitignore/api/macos,windows
+# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+my.sh
+
+log.txt
+data/
+
+# End of https://www.toptal.com/developers/gitignore/api/macos,windows
--- a/is-html-fast/domains.txt
+++ b/is-html-fast/domains.txt
--- a/lib/logger.py
+++ b/lib/logger.py
@ -0,0 +1,29 @@
+from pathlib import Path
+from datetime import datetime
+
+# Fixed log file location (a relative path: data/log.txt)
+FILE_PATH = Path("data/log.txt")
+
+def logger(msg: str) -> None:
+    """Append a timestamped message to the log file at FILE_PATH.
+
+    The parent directory and the file are created when missing, and a
+    trailing newline is added if msg lacks one. On any failure msg is
+    printed to stdout instead, so logging is best-effort.
+    """
+    try:
+        # Create the parent directory (no-op when it already exists).
+        FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
+
+        # Prefix the entry with the current local time.
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Append a newline only when msg does not already end with one.
+        newline = "" if msg.endswith("\n") else "\n"
+        line = f"[{timestamp}] {msg}{newline}"
+
+        # Mode "a" creates the file when missing and appends otherwise.
+        with FILE_PATH.open(mode="a", encoding="utf-8") as f:
+            f.write(line)
+    except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
+        print(msg)
--- a/lib/prompt.py
+++ b/lib/prompt.py
@ -0,0 +1,71 @@
+# Extra planner system-prompt text, passed to Agent as extend_planner_system_message
+extend_planner_system_message = """
+🎯 Mission: Collect Initial SSO Redirect URLs (For Browser Automation)
+
+※ **모든 STEP에서 구글 검색, Bing 검색 등 어떤 외부 검색 기능도 절대 사용하지 않고, 초기에 주어진 URL에서 탐색하세요.**
+
+0. **초기 블록(Block) 체크**
+   - 브라우저가 로그인 페이지에 접근하려 할 때, **페이지가 차단(blocked)** 되거나 **방화벽, CAPTCHA, 접근 제한** 등으로 인해 정상적으로 로드되지 않으면 즉시 프로세스를 종료하고 아래 JSON만 반환해야 합니다.  
+     ```json
+     [
+       {
+         "provider": "Blocked",
+         "oauth_uri": "-"
+       }
+     ]
+     ```
+   - 이후 단계로 절대 넘어가지 않도록 합니다.
+
+1. **로그인 페이지 탐색**
+   - **클라이언트(비엔터프라이즈) 로그인 페이지**로 직접 이동합니다. **검색 엔진을 사용하여 찾아서는 안 됩니다.**
+   - 접근 후 **개인정보/쿠키/동의 팝업**이 뜨면, 이를 반드시 **닫거나(Dismiss)** 처리하고 계속 진행합니다.
+   - (이미 0단계에서 블록 여부를 확인했으므로, 이 단계에서는 페이지가 정상 로드되었다고 가정합니다.)
+
+2. **SSO 버튼 식별**
+   - 로그인 페이지에서 다음과 같은 소셜 로그인 버튼을 찾습니다:
+     - “Continue with Google”
+     - “Sign in with GitHub”
+     - “Login with Naver”
+   - ✅ **실제 SSO 버튼**임이 명확히 확인되는 경우에만 진행합니다.
+   - ❌ 제외 대상:
+     - “Passkey” 관련 버튼
+     - 아이디/비밀번호 입력란
+     - 이메일 기반 로그인
+     - 인증서, 휴대폰 인증 등 비-OAuth 로그인 옵션
+
+3. **리디렉션 URL 캡처**
+   - 유효한 SSO 버튼을 하나 이상 찾았다면, 각각의 버튼을 **새 탭으로 열기**를 시도하거나, 불가능할 경우 **직접 클릭**합니다.
+   - 클릭 후 첫 번째로 **리디렉션된 URL(쿼리 스트링 포함)**을 캡처합니다. 이 URL은:
+     - ✅ 예시: `https://example.com/auth/google?include_all_params=...`
+     - ❌ **OAuth 공급자 자체 엔드포인트** (예: `https://accounts.google.com/...`)는 수집하지 않습니다.
+   - 만약 **반복 행동(looping)**이 감지될 경우(예: 동일한 버튼을 여러 번 열거나 페이지 간 반복 이동), 즉시 프로세스를 종료하고 **빈 배열**을 반환합니다:
+     ```json
+     []
+     ```
+   - 정상적으로 리디렉션 URL을 획득했다면, 아래 형식으로 결과를 수집합니다:
+     ```json
+     [
+       {
+         "provider": "Google",
+         "oauth_uri": "https://example.com/auth/google?include_all_params=..."
+       },
+       {
+         "provider": "GitHub",
+         "oauth_uri": "https://example.com/auth/github?include_all_params=..."
+       }
+     ]
+     ```
+
+4. **SSO 버튼 미발견 또는 오류 발생 시**
+   - 페이지 내부에 유효한 SSO 버튼이 전혀 없거나, 탐색 중 예기치 않은 오류가 발생하면 즉시 프로세스를 종료하고 **빈 배열**을 반환합니다:
+     ```json
+     []
+     ```
+
+5. **중요 사항**
+    - **반드시** 위의 단계들을 순서대로 수행해야 하며, 각 단계에서 발생하는 예외 상황을 정확히 처리해야 합니다.
+    - **반복 행동**이 감지되면 즉시 빈 배열을 반환하고, **블록된 페이지**는 초기 단계에서 처리하여 프로세스를 종료해야 합니다.
+    - **SSO 버튼이 발견되지 않거나, 오류가 발생한 경우에도 빈 배열을 반환해야 합니다.**
+    - **반드시** JSON 형식으로 결과를 반환해야 하며, 다른 형식은 허용되지 않습니다.
+    - 최대한 효율적인 단계로 진행하며, 불필요한 반복이나 검색 엔진 사용을 피해야 합니다.
+"""
--- a/main.py
+++ b/main.py
@ -13,6 +13,8 @@ from browser_use.browser.context import BrowserContext, BrowserContextConfig
 from lib.browser_config import browser_config_kwargs
 from lib.is_html import is_html_url
 from lib.read_txt import read_lines_between
+from lib.prompt import extend_planner_system_message
+from lib.logger import logger

 load_dotenv()

@ -33,92 +35,32 @@ class OAuth(BaseModel):
 class OAuthList(BaseModel):
    oauth_providers: List[OAuth]
    
-# Controller는 매번 새로 생성해도 무방합니다.
-def make_controller():
-    return Controller(output_model=OAuthList)
+async def clean_resources(agent, context, browser):
+    """Close agent, browser context and browser in order, reporting failures."""
+    # Each close() runs in its own try so that one failure is only printed
+    # and cannot prevent the remaining resources from being closed.
+    closers = (
+        ("⚠️ 에이전트 리소스 정리 실패", agent),
+        ("⚠️ 컨텍스트 리소스 정리 실패", context),
+        ("⚠️ 브라우저 리소스 정리 실패", browser),
+    )
+    for failure_label, resource in closers:
+        try:
+            await resource.close()
+        except Exception as e:
+            print(f"{failure_label}: {e}")
    
-# Extended planner prompt
-extend_planner_system_message = """
-🎯 Mission: Collect Initial SSO Redirect URLs (For Browser Automation)
-
-※ **절대로 구글 검색, Bing 검색 등 어떤 외부 검색 기능도 사용하지 말고, 주어진 로그인 페이지 URL을 직접 방문하여 탐색하세요.**
-
-0. **초기 블록(Block) 체크**
-   - 브라우저가 로그인 페이지에 접근하려 할 때, **페이지가 차단(blocked)** 되거나 **방화벽, CAPTCHA, 접근 제한** 등으로 인해 정상적으로 로드되지 않으면 즉시 프로세스를 종료하고 아래 JSON만 반환해야 합니다.  
-     ```json
-     [
-       {
-         "provider": "Blocked",
-         "oauth_uri": "-"
-       }
-     ]
-     ```
-   - 이후 단계로 절대 넘어가지 않도록 합니다.
-
-1. **로그인 페이지 탐색**
-   - **클라이언트(비엔터프라이즈) 로그인 페이지**로 직접 이동합니다. (검색 엔진을 사용하여 찾아서는 안 됩니다.)
-   - 접근 후 **개인정보/쿠키/동의 팝업**이 뜨면, 이를 반드시 **닫거나(Dismiss)** 처리하고 계속 진행합니다.
-   - (이미 0단계에서 블록 여부를 확인했으므로, 이 단계에서는 페이지가 정상 로드되었다고 가정합니다.)
-
-2. **SSO 버튼 식별**
-   - 로그인 페이지에서 다음과 같은 소셜 로그인 버튼을 찾습니다:
-     - “Continue with Google”
-     - “Sign in with GitHub”
-     - “Login with Naver”
-   - ✅ **실제 SSO 버튼**임이 명확히 확인되는 경우에만 진행합니다.
-   - ❌ 제외 대상:
-     - “Passkey” 관련 버튼
-     - 아이디/비밀번호 입력란
-     - 이메일 기반 로그인
-     - 인증서, 휴대폰 인증 등 비-OAuth 로그인 옵션
-
-3. **리디렉션 URL 캡처**
-   - 유효한 SSO 버튼을 하나 이상 찾았다면, 각각의 버튼을 **새 탭으로 열기**를 시도하거나, 불가능할 경우 **직접 클릭**합니다.
-   - 클릭 후 첫 번째로 **리디렉션된 URL(쿼리 스트링 포함)**을 캡처합니다. 이 URL은:
-     - ✅ 예시: `https://example.com/auth/google?include_all_params=...`
-     - ❌ **OAuth 공급자 자체 엔드포인트** (예: `https://accounts.google.com/...`)는 수집하지 않습니다.
-   - 만약 **반복 행동(looping)**이 감지될 경우(예: 동일한 버튼을 여러 번 열거나 페이지 간 반복 이동), 즉시 프로세스를 종료하고 **빈 배열**을 반환합니다:
-     ```json
-     []
-     ```
-   - 정상적으로 리디렉션 URL을 획득했다면, 아래 형식으로 결과를 수집합니다:
-     ```json
-     [
-       {
-         "provider": "Google",
-         "oauth_uri": "https://example.com/auth/google?include_all_params=..."
-       },
-       {
-         "provider": "GitHub",
-         "oauth_uri": "https://example.com/auth/github?include_all_params=..."
-       }
-     ]
-     ```
-
-4. **SSO 버튼 미발견 또는 오류 발생 시**
-   - 페이지 내부에 유효한 SSO 버튼이 전혀 없거나, 탐색 중 예기치 않은 오류가 발생하면 즉시 프로세스를 종료하고 **빈 배열**을 반환합니다:
-     ```json
-     []
-     ```
-
-5. **중요 사항**
-    - **반드시** 위의 단계들을 순서대로 수행해야 하며, 각 단계에서 발생하는 예외 상황을 정확히 처리해야 합니다.
-    - **반복 행동**이 감지되면 즉시 빈 배열을 반환하고, **블록된 페이지**는 초기 단계에서 처리하여 프로세스를 종료해야 합니다.
-    - **SSO 버튼이 발견되지 않거나, 오류가 발생한 경우에도 빈 배열을 반환해야 합니다.**
-    - **반드시** JSON 형식으로 결과를 반환해야 하며, 다른 형식은 허용되지 않습니다.
-    - 최대한 효율적인 단계로 진행하며, 불필요한 반복이나 검색 엔진 사용을 피해야 합니다.
-"""

 # ── URL별로 Browser를 새로 띄우는 함수 ──
 async def scan_one_url(url: str, skip_html_check: bool = False):
-    # 1) URL이 HTML 페이지인지 확인
-    if not is_html_url(url) and not skip_html_check:
-        print(f"❌ {url} 은(는) HTML이 아닙니다. 스킵합니다.")
-        return
-        
    target_url = url if url.startswith("http") else f"https://{url}"
    print(f"🚀 Starting scan for: {target_url}")
    
+    # 1) URL이 HTML 페이지인지 확인
+    if not is_html_url(target_url) and not skip_html_check:
+        print(f"❌ {target_url} 은(는) HTML이 아닙니다. 스킵합니다.")
+        return
+
    # Backend에 스캔 시작을 알림
    try:
        response = requests.post(f"{backend_url}/start", params={"url": target_url}, timeout=5)
@ -133,86 +75,96 @@ async def scan_one_url(url: str, skip_html_check: bool = False):
    except Exception as e:
        print(f"⚠️ Failed to notify backend: {e}")
        
-    # 2) Browser + Context 생성
-    browser = Browser(config=BrowserConfig(**browser_config_kwargs()))
-    context = BrowserContext(
-        browser=browser,
-        config=BrowserContextConfig(
-            wait_for_network_idle_page_load_time=3.0,
-            window_width=1600,
-            window_height=900,
-            locale='en-US',
-            highlight_elements=True,
-            viewport_expansion=500,
-            keep_alive=False
+    try_cnt = 0
+    while True:
+        # 2) Browser + Context 생성
+        browser = Browser(config=BrowserConfig(**browser_config_kwargs()))
+        context = BrowserContext(
+            browser=browser,
+            config=BrowserContextConfig(
+                wait_for_network_idle_page_load_time=3.0,
+                window_width=1600,
+                window_height=900,
+                locale='en-US',
+                highlight_elements=True,
+                viewport_expansion=500,
+                keep_alive=False
+            )
        )
-    )

-    # 3) Agent, Controller 생성
+        # 3) Agent, Controller 생성
+        initial_actions = [
+            {'open_tab': {'url': target_url}},
+        ]

-    initial_actions = [
-        {'open_tab': {'url': url}}
-    ]
+        controller = Controller(output_model=OAuthList)
+        agent = Agent(
+            browser_context=context,
+            browser=browser,
+            initial_actions=initial_actions,
+            task=f"Navigate to the login page, and collect the OAuth provider buttons and their login URLs. Ignore Passkey.",
+            llm=ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_MODEL")),
+            planner_llm=ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_PLANNER_MODEL")),
+            controller=controller,
+            extend_planner_system_message=extend_planner_system_message,
+            retry_delay=60,
+        )

-    controller = make_controller()
-    agent = Agent(
-        browser_context=context,
-        browser=browser,
-        initial_actions=initial_actions,
-        task=f"Navigate to the login page, and collect the OAuth provider buttons and their login URLs. Ignore Passkey.",
-        llm=ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_MODEL")),
-        planner_llm=ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_PLANNER_MODEL")),
-        controller=controller,
-        extend_planner_system_message=extend_planner_system_message,
-        retry_delay=60,
-    )
-    
-    try:
-        # 4) 실제 스캔 실행
-        response = await agent.run()
-        final_result = response.final_result()
-        if final_result is None:
-            raise ValueError("final_result()가 None을 반환했습니다.")
-
-        data = json.loads(final_result)
        try:
-            oauth_entries: List[OAuth] = [OAuth(**entry) for entry in data["oauth_providers"]]
-        except Exception as e:
-            raise ValueError(f"결과 파싱 실패: {e}\n원본 결과: {final_result}")
+            # 4) 실제 스캔 실행
+            response = await agent.run()
+            final_result = response.final_result()
+            if final_result is None:
+                raise ValueError("final_result()가 None을 반환했습니다.")

-        # 5) 결과 출력
-        print("-" * 50)
-        print(f"🔗 Scanned URL: {url}\n")
-        print("🔐 Detected OAuth Providers and URLs:")
-        for entry in oauth_entries:
-            if "<" in entry.oauth_uri or "..." in entry.oauth_uri:
-                print(f"⚠️ WARNING: {entry.provider} URL may be masked or incomplete:\n{entry.oauth_uri}\n")
-            else:
-                print(f"- {entry.provider}: {entry.oauth_uri}")
-        print("-" * 50)
+            data = json.loads(final_result)
+            try:
+                oauth_entries: List[OAuth] = [OAuth(**entry) for entry in data["oauth_providers"]]
+            except Exception as e:
+                raise ValueError(f"결과 파싱 실패: {e}\n원본 결과: {final_result}")

-        # 6) CSV에 저장 (append)
-        csv_file = "./oauth_providers.csv"
-        file_exists = os.path.isfile(csv_file)
-        with open(csv_file, "a", newline="", encoding="utf-8") as f:
-            writer = csv.writer(f)
-            if not file_exists:
-                writer.writerow(["issuer", "provider", "oauth_uri"])
+            # 5) 결과 출력
+            print("-" * 50)
+            print(f"🔗 Scanned URL: {url}\n")
+            print("🔐 Detected OAuth Providers and URLs:")
            for entry in oauth_entries:
-                writer.writerow([url, entry.provider, entry.oauth_uri])
-        print(f"✅ OAuth providers saved to {csv_file}\n")
+                if "<" in entry.oauth_uri or "..." in entry.oauth_uri:
+                    print(f"⚠️ WARNING: {entry.provider} URL may be masked or incomplete:\n{entry.oauth_uri}\n")
+                else:
+                    print(f"- {entry.provider}: {entry.oauth_uri}")
+            print("-" * 50)

-        # 7) Agent와 Browser 닫기
-        await agent.close()           # Agent 내부 작업 정리
-        await context.close()         # 브라우저 컨텍스트 종료 (탭/세션 닫기)
-        await browser.close()         # 실제 브라우저 프로세스 종료
+            # 6) CSV에 저장 (append)
+            csv_file = "./oauth_providers.csv"
+            file_exists = os.path.isfile(csv_file)
+            with open(csv_file, "a", newline="", encoding="utf-8") as f:
+                writer = csv.writer(f)
+                if not file_exists:
+                    writer.writerow(["issuer", "provider", "oauth_uri"])
+                for entry in oauth_entries:
+                    writer.writerow([url, entry.provider, entry.oauth_uri])
+            print(f"✅ OAuth providers saved to {csv_file}\n")
            
-    except Exception as e:
-        print(f"❌ Error scanning {url}: {e}")
-        # 에러 발생 시에도 Agent와 Browser는 닫아야 합니다.
-        await agent.close()
-        await context.close()
-        await browser.close()
+            await clean_resources(agent, context, browser)
+            
+            # 성공적으로 처리했으므로 반복문 탈출
+            break
+        
+        except Exception as e:
+            await clean_resources(agent, context, browser)
+            
+            if try_cnt >= 1:
+                print(f"❌ {url} 스캔에 실패했습니다. 에러: {e}")
+                logger(f"❌ {url} 스캔에 실패했습니다. 에러: {e}")
+                return
+            try_cnt += 1
+            print(f"⚠️ 에러 발생: {e}. {try_cnt}번째 재시도 중...")
+            
+
+            # Wait 5 seconds before retrying
+            await asyncio.sleep(5)
+            # 반복문을 통해 재시도
+            continue

 async def loop(filepath: str, start_line: int, end_line: int, skip_html_check: bool = False):
    # 인자값으로 받은 파일 경로와 줄 범위를 통해 도메인 리스트 생성
@ -229,7 +181,7 @@ async def loop(filepath: str, start_line: int, end_line: int, skip_html_check: b
        # scan_one_url은 외부에 정의된 비동기 함수라고 가정합니다.
        # 실제로 scan_one_url이 정의된 위치를 import하거나
        # 모듈 수준에 구현해두셔야 합니다.
-        await scan_one_url(f'http://{url}', skip_html_check=skip_html_check)
+        await scan_one_url(url, skip_html_check=skip_html_check)


 def main():
--- a/run.sh
+++ b/run.sh
@ -34,6 +34,7 @@ while [ "$current" -le "$END_LINE" ]; do
  uv run "$PYTHON_SCRIPT" -f "$DOMAIN_FILE" -s "$current" -e "$chunk_end" -skh $SKH_OPTION

  current=$(( chunk_end + 1 ))
+  sleep 1  # 1초 대기
 done

 echo "모든 청크 처리 완료."