Remove is-html-fast project files and update browser profile initialization to ignore HTTPS errors and enable specific command-line arguments.

This commit is contained in:
암냥 2025-07-13 22:13:18 +09:00
commit 319c5bb72f
6 changed files with 47 additions and 1800 deletions

View file

@ -1,71 +0,0 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
# Generated by cargo mutants
# Contains mutation testing data
**/mutants.out*/
# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# General
.DS_Store
.AppleDouble
.LSOverride
Icon[]
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk

1581
is-html-fast/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,8 +0,0 @@
# Manifest for the one-off `is-html-fast` domain filter binary.
[package]
name = "is-html-fast"
version = "0.1.0"
edition = "2024"
[dependencies]
# Supplies `par_iter`, used to probe the domain list in parallel.
rayon = "1.10.0"
# HTTP client; the `blocking` feature provides `reqwest::blocking::Client`.
# NOTE(review): the `json` feature does not appear to be used by main.rs - confirm before keeping it.
reqwest = { version = "0.12.19", features = ["blocking", "json"]}

View file

@ -1,2 +0,0 @@
실제로 사용되진 않습니다.
일회용 코드입니다.

View file

@ -1,92 +0,0 @@
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use std::sync::atomic::{AtomicUsize, Ordering};
use rayon::prelude::*;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE, USER_AGENT, ACCEPT, ACCEPT_LANGUAGE, ACCEPT_ENCODING, CONNECTION, UPGRADE_INSECURE_REQUESTS};
/// Probes every host listed in `domains.txt` over HTTPS and records the ones
/// that answer with a `text/html` Content-Type in `domains-filtered.txt`.
///
/// Requests are issued in parallel via rayon's `par_iter`, each with a
/// 5 second timeout and browser-like default headers. Progress is printed per
/// domain, and a summary with per-category percentages is printed at the end.
/// Every domain ends up in exactly one of the three tallies (HTML, non-HTML,
/// failed), so the tallies always add up to the total.
///
/// # Errors
///
/// Returns an error if `domains.txt` cannot be opened or read, if
/// `domains-filtered.txt` cannot be created, or if the HTTP client cannot be
/// built. Failures for an individual domain (connection errors, output write
/// errors) are reported and do not abort the run.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let input_file = File::open("domains.txt")?;
    let reader = BufReader::new(input_file);

    // Propagate read errors instead of `filter_map(Result::ok)`, which hides
    // them and can loop forever on a reader that keeps failing. Blank lines
    // are skipped so they do not turn into requests for "https://".
    let mut domains: Vec<String> = Vec::new();
    for line in reader.lines() {
        let line = line?;
        let domain = line.trim();
        if !domain.is_empty() {
            domains.push(domain.to_owned());
        }
    }
    let total_count = domains.len();

    let counter = Arc::new(AtomicUsize::new(0));
    let html_count = Arc::new(AtomicUsize::new(0));
    let failed_count = Arc::new(AtomicUsize::new(0));
    let non_html_count = Arc::new(AtomicUsize::new(0));

    let output_file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open("domains-filtered.txt")?;
    let output = Arc::new(Mutex::new(output_file));

    // Browser-like request headers.
    let mut headers = HeaderMap::new();
    headers.insert(USER_AGENT, HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0"));
    headers.insert(ACCEPT, HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"));
    headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("ko,en-US;q=0.7,en;q=0.3"));
    headers.insert(ACCEPT_ENCODING, HeaderValue::from_static("gzip, deflate, br"));
    headers.insert(CONNECTION, HeaderValue::from_static("keep-alive"));
    headers.insert(UPGRADE_INSECURE_REQUESTS, HeaderValue::from_static("1"));

    let client = Arc::new(
        Client::builder()
            .timeout(Duration::from_secs(5))
            .default_headers(headers)
            .build()?,
    );

    domains.par_iter().for_each(|domain| {
        let current = counter.fetch_add(1, Ordering::SeqCst) + 1;
        let url = format!("https://{}", domain);

        let resp = match client.get(&url).send() {
            Ok(resp) => resp,
            Err(_) => {
                failed_count.fetch_add(1, Ordering::SeqCst);
                println!("[{}/{}] ⚠️ Failed to connect: {}", current, total_count, domain);
                return;
            }
        };

        // Only the response headers are inspected; the body is never read.
        match resp.headers().get(CONTENT_TYPE).map(|value| value.to_str()) {
            Some(Ok(content_type_str)) => {
                // Media types are case-insensitive, so compare in lowercase.
                let is_html = content_type_str
                    .trim_start()
                    .to_ascii_lowercase()
                    .starts_with("text/html");
                if is_html {
                    {
                        // A poisoned lock only means another worker panicked
                        // mid-write; the file handle itself is still usable.
                        let mut file = output
                            .lock()
                            .unwrap_or_else(|poisoned| poisoned.into_inner());
                        if let Err(err) = writeln!(file, "{}", domain) {
                            eprintln!(
                                "[{}/{}] ⚠️ Could not write {} to domains-filtered.txt: {}",
                                current, total_count, domain, err
                            );
                        }
                    }
                    html_count.fetch_add(1, Ordering::SeqCst);
                    println!("[{}/{}] ✅ HTML: {}", current, total_count, domain);
                } else {
                    non_html_count.fetch_add(1, Ordering::SeqCst);
                    println!("[{}/{}] ❌ Not HTML: {} ({})", current, total_count, domain, content_type_str);
                }
            }
            Some(Err(_)) => {
                // The header is present but cannot be read as visible ASCII.
                // It used to be dropped without being counted or logged.
                non_html_count.fetch_add(1, Ordering::SeqCst);
                println!("[{}/{}] ❌ Unreadable Content-Type: {}", current, total_count, domain);
            }
            None => {
                non_html_count.fetch_add(1, Ordering::SeqCst);
                println!("[{}/{}] ❌ No Content-Type: {}", current, total_count, domain);
            }
        }
    });

    // Final results
    let html_final = html_count.load(Ordering::SeqCst);
    let failed_final = failed_count.load(Ordering::SeqCst);
    let non_html_final = non_html_count.load(Ordering::SeqCst);

    // Guard against an empty input list, which would otherwise print NaN%.
    let percent = |count: usize| -> f64 {
        if total_count == 0 {
            0.0
        } else {
            (count as f64 / total_count as f64) * 100.0
        }
    };

    println!("\n=== Final Results ===");
    println!("📊 Total domains: {}", total_count);
    println!("✅ HTML domains: {} ({:.1}%)", html_final, percent(html_final));
    println!("❌ Non-HTML domains: {} ({:.1}%)", non_html_final, percent(non_html_final));
    println!("⚠️ Failed connections: {} ({:.1}%)", failed_final, percent(failed_final));
    println!("💾 HTML domains saved to: domains-filtered.txt");
    Ok(())
}

View file

@ -66,6 +66,7 @@ async def GetProfile(headless=False):
profile = BrowserProfile(
# Security settings
# disable_security=True,
ignore_https_errors=True,
# Display settings
headless=headless,
# Data persistence
@ -78,52 +79,52 @@ async def GetProfile(headless=False):
# "--disable-features=Translate,PasswordManagerDefaultEnabled",
],
ignore_default_args=[
# "--disable-datasaver-prompt",
# "--disable-component-extensions-with-background-pages",
# "--disable-prompt-on-repost",
# "--safeBrowse-disable-auto-update",
# "--install-autogenerated-theme=0,0,0",
# "--disable-speech-synthesis-api",
# "--ash-no-nudges",
# "--test-type=gpu",
# "--noerrdialogs",
# "--disable-external-intent-requests",
# "--disable-breakpad",
# "--disable-backgrounding-occluded-windows",
# "--export-tagged-pdf",
# "--disable-focus-on-load",
# "--suppress-message-center-popups",
# "--disable-renderer-backgrounding",
# "--hide-crash-restore-bubble",
# "--disable-back-forward-cache",
# "--allow-legacy-extension-manifests",
# # "--disable-field-trial-config", # 왜 이걸 끄면 웹사이트가 압축된 형태로 보이는 진 모르곘음
# "--disable-popup-blocking",
# "--disable-background-networking",
# "--no-first-run",
# "--disable-blink-features=AutomationControlled",
# "--password-store=basic",
# "--enable-network-information-downlink-max",
# "--allow-pre-commit-input",
# "--enable-features=NetworkService,NetworkServiceInProcess",
# "--metrics-recording-only",
# "--silent-debugger-extension-api",
# "--disable-features=AcceptCHFrame,AutoExpandDetailsElement,AvoidUnnecessaryBeforeUnloadCheckSync,CertificateTransparencyComponentUpdater,DestroyProfileOnBrowserClose,DialMediaRouteProvider,ExtensionManifestV2Disabled,GlobalMediaControls,HttpsUpgrades,ImprovedCookieControls,LazyFrameLoading,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate,AutomationControlled,BackForwardCache,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,InterestFeedContentSuggestions,CalculateNativeWinOcclusion,HeavyAdPrivacyMitigations,PrivacySandboxSettings4,AutofillServerCommunication,CrashReporting,OverscrollHistoryNavigation,InfiniteSessionRestore,ExtensionDisableUnsupportedDeveloper",
# "--disable-ipc-flooding-protection",
# "--disable-hang-monitor",
# "--disable-dev-shm-usage",
# "--disable-client-side-phishing-detection",
# "--log-level=2",
# "--generate-pdf-document-outline",
# "--disable-speech-api",
# "--disable-search-engine-choice-screen",
# "--no-service-autorun",
# "--no-pings",
# "--disable-component-update",
# '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
# "--disable-background-timer-throttling",
# "--use-mock-keychain",
# "--disable-features=IsolateOrigins,site-per-process",
"--disable-datasaver-prompt",
"--disable-component-extensions-with-background-pages",
"--disable-prompt-on-repost",
"--safeBrowse-disable-auto-update",
"--install-autogenerated-theme=0,0,0",
"--disable-speech-synthesis-api",
"--ash-no-nudges",
"--test-type=gpu",
"--noerrdialogs",
"--disable-external-intent-requests",
"--disable-breakpad",
"--disable-backgrounding-occluded-windows",
"--export-tagged-pdf",
"--disable-focus-on-load",
"--suppress-message-center-popups",
"--disable-renderer-backgrounding",
"--hide-crash-restore-bubble",
"--disable-back-forward-cache",
"--allow-legacy-extension-manifests",
# "--disable-field-trial-config", # 왜 이걸 끄면 웹사이트가 압축된 형태로 보이는지는 모르겠음
"--disable-popup-blocking",
"--disable-background-networking",
"--no-first-run",
"--disable-blink-features=AutomationControlled",
"--password-store=basic",
"--enable-network-information-downlink-max",
"--allow-pre-commit-input",
"--enable-features=NetworkService,NetworkServiceInProcess",
"--metrics-recording-only",
"--silent-debugger-extension-api",
"--disable-features=AcceptCHFrame,AutoExpandDetailsElement,AvoidUnnecessaryBeforeUnloadCheckSync,CertificateTransparencyComponentUpdater,DestroyProfileOnBrowserClose,DialMediaRouteProvider,ExtensionManifestV2Disabled,GlobalMediaControls,HttpsUpgrades,ImprovedCookieControls,LazyFrameLoading,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate,AutomationControlled,BackForwardCache,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,InterestFeedContentSuggestions,CalculateNativeWinOcclusion,HeavyAdPrivacyMitigations,PrivacySandboxSettings4,AutofillServerCommunication,CrashReporting,OverscrollHistoryNavigation,InfiniteSessionRestore,ExtensionDisableUnsupportedDeveloper",
"--disable-ipc-flooding-protection",
"--disable-hang-monitor",
"--disable-dev-shm-usage",
"--disable-client-side-phishing-detection",
"--log-level=2",
"--generate-pdf-document-outline",
"--disable-speech-api",
"--disable-search-engine-choice-screen",
"--no-service-autorun",
"--no-pings",
"--disable-component-update",
'--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
"--disable-background-timer-throttling",
"--use-mock-keychain",
"--disable-features=IsolateOrigins,site-per-process",
# 아래는 기존 예시에 있던 인자들입니다. 필요에 따라 유지하거나 제거하세요.
"--enable-automation",
"--disable-extensions",