Implement HTML domain checker in Rust and add PowerShell script for chunked execution

- Added a Rust program that reads a list of domains from "domains.txt", checks if they return HTML content, and writes valid domains to "domains-filtered.txt".
- Introduced a PowerShell script to execute a Python script in chunks, allowing for processing of specified line ranges from the domain list.
This commit is contained in:
imnyang 2025-06-06 23:47:59 +09:00
commit 351af7ba78
9 changed files with 36159 additions and 6 deletions

71
is-html-fast/.gitignore vendored Normal file
View file

@ -0,0 +1,71 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
# Generated by cargo mutants
# Contains mutation testing data
**/mutants.out*/
# RustRover
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# General
.DS_Store
.AppleDouble
.LSOverride
Icon[]
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk

1581
is-html-fast/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

8
is-html-fast/Cargo.toml Normal file
View file

@ -0,0 +1,8 @@
[package]
name = "is-html-fast"
version = "0.1.0"
edition = "2024"
[dependencies]
rayon = "1.10.0"
reqwest = { version = "0.12.19", features = ["blocking", "json"]}

2
is-html-fast/README.md Normal file
View file

@ -0,0 +1,2 @@
실제로 사용되진 않습니다.
일회용 코드입니다.

34377
is-html-fast/domains.txt Normal file

File diff suppressed because it is too large Load diff

59
is-html-fast/src/main.rs Normal file
View file

@ -0,0 +1,59 @@
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use rayon::prelude::*;
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
/// Returns `true` when a `Content-Type` header value starts with `text/html`.
///
/// Media type names are case-insensitive, so the prefix is compared without
/// regard to ASCII case (`Text/HTML; charset=utf-8` matches as well).
fn is_html_content_type(value: &str) -> bool {
    const HTML: &str = "text/html";
    // `get` yields `None` when the value is shorter than the prefix or the
    // cut would land inside a multi-byte character, so this never panics.
    value
        .get(..HTML.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(HTML))
}

/// Reads domains from `domains.txt` (one per line), requests `https://<domain>`
/// for each of them in parallel, and appends every domain whose response carries
/// a `text/html` `Content-Type` header to `domains-filtered.txt`.
///
/// Progress and per-domain results are printed to stdout; problems with the
/// input or output files are printed to stderr.
///
/// # Errors
///
/// Returns an error when `domains.txt` cannot be opened or read, when
/// `domains-filtered.txt` cannot be created, when the HTTP client cannot be
/// built, or when at least one matching domain could not be written to the
/// output file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let input_file = File::open("domains.txt")?;
    let reader = BufReader::new(input_file);

    let mut domains: Vec<String> = Vec::new();
    for line in reader.lines() {
        match line {
            Ok(line) => {
                // Ignore surrounding whitespace and blank lines so we never
                // issue a request for a bare "https://".
                let domain = line.trim();
                if !domain.is_empty() {
                    domains.push(domain.to_owned());
                }
            }
            // A line that is not valid UTF-8 cannot be a usable domain; skip
            // it, but say so instead of dropping it silently.
            Err(err) if err.kind() == std::io::ErrorKind::InvalidData => {
                eprintln!("⚠️ Skipping undecodable line in domains.txt: {}", err);
            }
            // Any other read error means the list is incomplete; stop rather
            // than retrying a failing reader over and over.
            Err(err) => return Err(err.into()),
        }
    }

    let output_file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open("domains-filtered.txt")?;
    let output = Arc::new(Mutex::new(output_file));

    // The worker closure only borrows the client, so no extra `Arc` is needed.
    let client = Client::builder()
        .timeout(Duration::from_secs(5))
        .build()?;

    // Number of matching domains that could not be written to the output file.
    let write_failures = std::sync::atomic::AtomicUsize::new(0);

    domains.par_iter().for_each(|domain| {
        let url = format!("https://{}", domain);
        println!("Checking {}", url);

        let resp = match client.get(&url).send() {
            Ok(resp) => resp,
            Err(_) => {
                println!("⚠️ Failed to connect: {}", domain);
                return;
            }
        };

        let Some(content_type) = resp.headers().get(CONTENT_TYPE) else {
            println!("❌ No Content-Type: {}", domain);
            return;
        };

        let Ok(content_type_str) = content_type.to_str() else {
            println!("❌ Unreadable Content-Type: {}", domain);
            return;
        };

        if !is_html_content_type(content_type_str) {
            println!("❌ Not HTML: {} ({})", domain, content_type_str);
            return;
        }

        // Keep the lock only for the write itself. A poisoned lock still
        // guards a usable file handle, so recover it instead of losing the
        // result.
        let written = {
            let mut file = output
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            writeln!(file, "{}", domain)
        };

        match written {
            Ok(()) => println!("✅ HTML: {}", domain),
            Err(err) => {
                write_failures.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                eprintln!("⚠️ Failed to record {}: {}", domain, err);
            }
        }
    });

    let failed = write_failures.load(std::sync::atomic::Ordering::Relaxed);
    if failed > 0 {
        return Err(format!(
            "failed to write {} domain(s) to domains-filtered.txt",
            failed
        )
        .into());
    }

    Ok(())
}