mirror of
https://github.com/j93es/browser-use-oauth.git
synced 2026-06-04 02:41:53 +09:00
Implement HTML domain checker in Rust and add PowerShell script for chunked execution
- Added a Rust program that reads a list of domains from "domains.txt", checks if they return HTML content, and writes valid domains to "domains-filtered.txt". - Introduced a PowerShell script to execute a Python script in chunks, allowing for processing of specified line ranges from the domain list.
This commit is contained in:
parent
c6ccc514b1
commit
351af7ba78
9 changed files with 36159 additions and 6 deletions
71
is-html-fast/.gitignore
vendored
Normal file
71
is-html-fast/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# Generated by Cargo
|
||||
# will have compiled files and executables
|
||||
debug/
|
||||
target/
|
||||
|
||||
# These are backup files generated by rustfmt
|
||||
**/*.rs.bk
|
||||
|
||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||
*.pdb
|
||||
|
||||
# Generated by cargo mutants
|
||||
# Contains mutation testing data
|
||||
**/mutants.out*/
|
||||
|
||||
# RustRover
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
Icon[]
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
|
||||
# Windows thumbnail cache files
|
||||
Thumbs.db
|
||||
Thumbs.db:encryptable
|
||||
ehthumbs.db
|
||||
ehthumbs_vista.db
|
||||
|
||||
# Dump file
|
||||
*.stackdump
|
||||
|
||||
# Folder config file
|
||||
[Dd]esktop.ini
|
||||
|
||||
# Recycle Bin used on file shares
|
||||
$RECYCLE.BIN/
|
||||
|
||||
# Windows Installer files
|
||||
*.cab
|
||||
*.msi
|
||||
*.msix
|
||||
*.msm
|
||||
*.msp
|
||||
|
||||
# Windows shortcuts
|
||||
*.lnk
|
||||
1581
is-html-fast/Cargo.lock
generated
Normal file
1581
is-html-fast/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
8
is-html-fast/Cargo.toml
Normal file
8
is-html-fast/Cargo.toml
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
[package]
|
||||
name = "is-html-fast"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
rayon = "1.10.0"
|
||||
reqwest = { version = "0.12.19", features = ["blocking", "json"]}
|
||||
2
is-html-fast/README.md
Normal file
2
is-html-fast/README.md
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
실제로 사용되진 않습니다.
|
||||
일회용 코드입니다.
|
||||
34377
is-html-fast/domains.txt
Normal file
34377
is-html-fast/domains.txt
Normal file
File diff suppressed because it is too large
Load diff
59
is-html-fast/src/main.rs
Normal file
59
is-html-fast/src/main.rs
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufRead, BufReader, Write};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use rayon::prelude::*;
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let input_file = File::open("domains.txt")?;
|
||||
let reader = BufReader::new(input_file);
|
||||
let domains: Vec<String> = reader.lines().filter_map(Result::ok).collect();
|
||||
|
||||
let output_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.truncate(true)
|
||||
.open("domains-filtered.txt")?;
|
||||
|
||||
let output = Arc::new(Mutex::new(output_file));
|
||||
|
||||
let client = Arc::new(
|
||||
Client::builder()
|
||||
.timeout(Duration::from_secs(5))
|
||||
.build()?,
|
||||
);
|
||||
|
||||
domains.par_iter().for_each(|domain| {
|
||||
let url = format!("https://{}", domain);
|
||||
println!("Checking {}", url);
|
||||
|
||||
let response = client.get(&url).send();
|
||||
|
||||
match response {
|
||||
Ok(resp) => {
|
||||
if let Some(content_type) = resp.headers().get(CONTENT_TYPE) {
|
||||
if let Ok(content_type_str) = content_type.to_str() {
|
||||
if content_type_str.starts_with("text/html") {
|
||||
if let Ok(mut file) = output.lock() {
|
||||
writeln!(file, "{}", domain).ok();
|
||||
}
|
||||
println!("✅ HTML: {}", domain);
|
||||
} else {
|
||||
println!("❌ Not HTML: {} ({})", domain, content_type_str);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("❌ No Content-Type: {}", domain);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
println!("⚠️ Failed to connect: {}", domain);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue