From 1d5e470f4e3973942372e866c8d6c6856e58dbc4 Mon Sep 17 00:00:00 2001 From: imnyang Date: Sun, 15 Jun 2025 13:15:41 +0900 Subject: [PATCH] =?UTF-8?q?[Enhancement]=20=EC=A7=84=ED=96=89=20=EC=83=81?= =?UTF-8?q?=ED=99=A9=20=EC=A0=80=EC=9E=A5=20=EB=B0=8F=20=EB=A1=9C=EB=93=9C?= =?UTF-8?q?=20=EA=B8=B0=EB=8A=A5=20=EC=B6=94=EA=B0=80,=20=EC=8A=A4?= =?UTF-8?q?=EC=BA=94=20=EC=A4=91=EB=8B=A8=20=EC=8B=9C=20=EC=B2=98=EB=A6=AC?= =?UTF-8?q?=20=EC=83=81=ED=83=9C=20=EC=B6=9C=EB=A0=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/utils/backend_client.py | 3 +- lib/utils/browser_use/__init__.py | 3 +- main.py | 79 ++++++++++++++++++++++++++++++- run.ps1 | 34 ++++--------- run.sh | 18 ++----- 5 files changed, 92 insertions(+), 45 deletions(-) diff --git a/lib/utils/backend_client.py b/lib/utils/backend_client.py index 2d289bd..68f497e 100644 --- a/lib/utils/backend_client.py +++ b/lib/utils/backend_client.py @@ -1,5 +1,6 @@ import requests -from config import BACKEND_URL + +from lib.utils.config import BACKEND_URL def notify_backend(target_url): # Backend에 스캔 시작을 알림 diff --git a/lib/utils/browser_use/__init__.py b/lib/utils/browser_use/__init__.py index de3d347..cde80d6 100644 --- a/lib/utils/browser_use/__init__.py +++ b/lib/utils/browser_use/__init__.py @@ -1,5 +1,4 @@ -from func import * -import clean_resources as clean_resources_func +from lib.utils.browser_use.func import * # Initialize configuration proxy_url = setup_proxy() diff --git a/main.py b/main.py index eb116bb..0585c64 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,8 @@ import json import os import csv import argparse +from pathlib import Path +import signal from dotenv import load_dotenv @@ -31,6 +33,10 @@ load_dotenv(verbose=True, override=True) INITIAL_BACKOFF = int(os.getenv("INITIAL_BACKOFF", "60")) # seconds MAX_BACKOFF = int(os.getenv("MAX_BACKOFF", "600")) # seconds +# 진행 상황 추적을 위한 전역 변수 +current_progress = {"current_index": 0, "total": 0, "current_url": "", "start_line": 0} +progress_file = Path("data/scan_progress.json") + env_cheker() if os.getenv("LMNR_PROJECT_API_KEY"): from lmnr import Laminar @@ -38,6 +44,43 @@ if os.getenv("LMNR_PROJECT_API_KEY"): Laminar.initialize(project_api_key=os.getenv("LMNR_PROJECT_API_KEY")) +def save_progress(): + """현재 진행 상황을 파일에 저장""" + with open(progress_file, 'w', encoding='utf-8') as f: + json.dump(current_progress, f, ensure_ascii=False, indent=2) + + +def load_progress(): + """이전 진행 상황을 파일에서 불러오기""" + if os.path.exists(progress_file): + try: + with open(progress_file, 'r', encoding='utf-8') as f: + return json.load(f) + except: + return None + return None + + +def signal_handler(signum, frame): + """Ctrl+C 시그널 핸들러""" + print("\n" + "="*60) + print("🛑 스캔이 중단되었습니다!") + print(f"📊 진행 상황:") + print(f" - 전체: {current_progress['total']}개 URL") + print(f" - 완료: {current_progress['current_index']}개 URL") + print(f" - 현재 처리 중: {current_progress['current_url']}") + print(f" - domains.txt의 {current_progress['start_line'] + current_progress['current_index']}번째 줄") + print(f" - 진행률: {current_progress['current_index']}/{current_progress['total']} ({current_progress['current_index']/current_progress['total']*100:.1f}%)") + print("="*60) + save_progress() + print(f"💾 진행 상황이 {progress_file}에 저장되었습니다.") + exit(0) + + +# 시그널 핸들러 등록 +signal.signal(signal.SIGINT, signal_handler) + + # ── URL별로 Browser를 새로 띄우는 함수 ── async def scan_one_url(url: str, skip_html_check: bool = False): target_url = url if url.startswith("http") else f"https://{url}" @@ -150,18 +193,50 @@ async def loop( filepath=filepath, start_line=start_line, end_line=end_line ) + # 진행 상황 초기화 + current_progress["total"] = len(target_list) + current_progress["start_line"] = start_line + current_progress["current_index"] = 0 + + # 이전 진행 상황 확인 + prev_progress = load_progress() + if prev_progress and prev_progress.get("start_line") == start_line: + print(f"📋 이전 진행 상황을 발견했습니다:") + print(f" - 이전 완료: {prev_progress['current_index']}/{prev_progress['total']}") + print(f" - 마지막 처리: {prev_progress.get('current_url', 'N/A')}") + + resume = input("이어서 진행하시겠습니까? (y/n): ").lower().strip() + if resume == 'y': + current_progress["current_index"] = prev_progress["current_index"] + target_list = target_list[current_progress["current_index"]:] + print(f"✅ {current_progress['current_index']}번째부터 재개합니다.") + # (필요하다면) 강제 설정이 필요한 경우, 아래 주석을 해제하여 target_list[0] 등을 덮어쓸 수 있습니다. # target_list[0] = "velog.io" for i, url in enumerate(target_list): - print(f"\n🔄 Processing {i+1}/{len(target_list)}: {url}") + actual_index = current_progress["current_index"] + i + current_progress["current_url"] = url + current_progress["current_index"] = actual_index + + print(f"\n🔄 Processing {actual_index + 1}/{current_progress['total']}: {url}") + print(f"📍 domains.txt의 {start_line + actual_index}번째 줄") # URL들 사이에 API 쿼터 회복을 위한 대기 시간 추가 - if i > 0: + if actual_index > 0: print("⏳ API 쿼터 보호를 위해 30초 대기 중...") await asyncio.sleep(30) await scan_one_url(url, skip_html_check=skip_html_check) + + # 진행 상황 저장 + current_progress["current_index"] = actual_index + 1 + save_progress() + + print(f"\n🎉 모든 스캔이 완료되었습니다! ({current_progress['total']}개 URL)") + # 완료 후 진행 상황 파일 삭제 + if os.path.exists(progress_file): + os.remove(progress_file) def main(): diff --git a/run.ps1 b/run.ps1 index 47b101e..6ccf270 100644 --- a/run.ps1 +++ b/run.ps1 @@ -4,9 +4,6 @@ $PYTHON_SCRIPT = "main.py" # 도메인 목록 파일 경로 (Python 스크립트 실행 시 -f 옵션에 전달) $DOMAIN_FILE = "./domains.txt" - -# 몇 줄씩(chunk) 나눠서 실행할지 -$CHUNK_SIZE = 10 # ───────────── # https://f.imnya.ng/.whs/tp-domains/data/domains/latest.txt @@ -26,27 +23,14 @@ $START_LINE = [int]$args[0] $END_LINE = [int]$args[1] $SKIP_HEADER = if ($args.Count -eq 3) { $args[2] } else { "False" } -# START_LINE부터 END_LINE까지 CHUNK_SIZE 만큼씩 반복 -$current = $START_LINE -while ($current -le $END_LINE) { - # 각 청크 구간의 마지막 줄 계산 - $chunk_end = $current + $CHUNK_SIZE - 1 - if ($chunk_end -gt $END_LINE) { - $chunk_end = $END_LINE - } +$timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss" +Write-Host "[$timestamp] Processing lines $START_LINE to $END_LINE..." - $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss" - Write-Host "[$timestamp] Processing lines $current to $chunk_end..." - - # Python 스크립트 실행 - # -f DOMAIN_FILE: 도메인 목록 파일 경로 - # -s current : 읽기 시작 줄 - # -e chunk_end: 읽기 끝 줄 - # -skh SKIP_HEADER: 헤더 스킵 여부 - uv run $PYTHON_SCRIPT -f $DOMAIN_FILE -s $current -e $chunk_end -skh $SKIP_HEADER +# Python 스크립트 실행 +# -f DOMAIN_FILE: 도메인 목록 파일 경로 +# -s START_LINE : 읽기 시작 줄 +# -e END_LINE : 읽기 끝 줄 +# -skh SKIP_HEADER: 헤더 스킵 여부 +uv run $PYTHON_SCRIPT -f $DOMAIN_FILE -s $START_LINE -e $END_LINE -skh $SKIP_HEADER - # 다음 청크의 시작 값 설정 - $current = $chunk_end + 1 -} - -Write-Host "모든 청크 처리 완료." +Write-Host "처리 완료." diff --git a/run.sh b/run.sh index b78ca57..cd79cb1 100755 --- a/run.sh +++ b/run.sh @@ -3,7 +3,6 @@ # ── 설정 부분 ── PYTHON_SCRIPT="main.py" DOMAIN_FILE="./domains.txt" -CHUNK_SIZE=10 # ───────────── curl "https://f.imnya.ng/.whs/tp-domains/data/domains/latest.txt" -o $DOMAIN_FILE @@ -23,18 +22,7 @@ if [ -z "$SKH_OPTION" ]; then SKH_OPTION="False" fi -current=$START_LINE -while [ "$current" -le "$END_LINE" ]; do - chunk_end=$(( current + CHUNK_SIZE - 1 )) - if [ "$chunk_end" -gt "$END_LINE" ]; then - chunk_end=$END_LINE - fi +echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing lines ${START_LINE} to ${END_LINE}..." +uv run "$PYTHON_SCRIPT" -f "$DOMAIN_FILE" -s "$START_LINE" -e "$END_LINE" -skh $SKH_OPTION - echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing lines ${current} to ${chunk_end}..." - uv run "$PYTHON_SCRIPT" -f "$DOMAIN_FILE" -s "$current" -e "$chunk_end" -skh $SKH_OPTION - - current=$(( chunk_end + 1 )) - sleep 1 # 1초 대기 -done - -echo "모든 청크 처리 완료." +echo "처리 완료."