feat: add html2text dependency and implement HTML to Markdown conversion in email body processing

This commit is contained in:
암냥 2026-01-22 16:25:33 +09:00
commit 76d77d897c
No known key found for this signature in database
3 changed files with 238 additions and 3 deletions

217
Cargo.lock generated
View file

@ -305,6 +305,28 @@ version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
[[package]]
name = "html2text"
version = "0.16.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ea4ba4c0f993633569337a4fadfbd4f27448c81adb1af0c2407065f52809662"
dependencies = [
"html5ever",
"tendril",
"thiserror",
"unicode-width",
]
[[package]]
name = "html5ever"
version = "0.37.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5935f02fdc02823ff15fec27c2b3d7ca19d629e996f7a0ae4d7d500e62e54c76"
dependencies = [
"log",
"markup5ever",
]
[[package]] [[package]]
name = "http" name = "http"
version = "0.2.12" version = "0.2.12"
@ -596,6 +618,15 @@ version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
[[package]]
name = "lock_api"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
dependencies = [
"scopeguard",
]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.29" version = "0.4.29"
@ -613,6 +644,17 @@ dependencies = [
"quoted_printable", "quoted_printable",
] ]
[[package]]
name = "markup5ever"
version = "0.37.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cfb33ea12d5d83b1ba9a55ae7d05faec4f2189d47b79c04d4cea6bbe9f5b083"
dependencies = [
"log",
"tendril",
"web_atoms",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.6" version = "2.7.6"
@ -653,14 +695,22 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "newsletter" name = "newsletter"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"chrono", "chrono",
"html2text",
"imap", "imap",
"mailparse", "mailparse",
"native-tls", "native-tls",
"regex",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
@ -747,12 +797,74 @@ dependencies = [
"vcpkg", "vcpkg",
] ]
[[package]]
name = "parking_lot"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-link",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.2" version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
[[package]]
name = "phf"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
dependencies = [
"phf_shared",
"serde",
]
[[package]]
name = "phf_codegen"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
dependencies = [
"fastrand",
"phf_shared",
]
[[package]]
name = "phf_shared"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
dependencies = [
"siphasher",
]
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.16" version = "0.2.16"
@ -780,6 +892,12 @@ dependencies = [
"zerovec", "zerovec",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.105" version = "1.0.105"
@ -810,6 +928,15 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "redox_syscall"
version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
"bitflags 2.10.0",
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.12.2" version = "1.12.2"
@ -922,6 +1049,12 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]] [[package]]
name = "security-framework" name = "security-framework"
version = "2.11.1" version = "2.11.1"
@ -1015,6 +1148,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "siphasher"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.11" version = "0.4.11"
@ -1059,6 +1198,30 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "string_cache"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
]
[[package]]
name = "string_cache_codegen"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.114" version = "2.0.114"
@ -1121,6 +1284,36 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "tendril"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24"
dependencies = [
"new_debug_unreachable",
"utf-8",
]
[[package]]
name = "thiserror"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "tinystr" name = "tinystr"
version = "0.8.2" version = "0.8.2"
@ -1246,6 +1439,12 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]] [[package]]
name = "url" name = "url"
version = "2.5.8" version = "2.5.8"
@ -1258,6 +1457,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
@ -1369,6 +1574,18 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "web_atoms"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30e588f10c7bc3465f5fc1ab087fc97877ec1064a7ec89fb685ac4ee998dac4a"
dependencies = [
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
]
[[package]] [[package]]
name = "windows-core" name = "windows-core"
version = "0.62.2" version = "0.62.2"

View file

@ -12,6 +12,8 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1.0"
toml = "0.8" toml = "0.8"
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }
html2text = "0.16.6"
regex = "1.12.2"

View file

@ -1,5 +1,6 @@
use mailparse::MailHeaderMap; use mailparse::MailHeaderMap;
use native_tls::TlsConnector; use native_tls::TlsConnector;
use regex::Regex;
use serde::Deserialize; use serde::Deserialize;
use std::fs; use std::fs;
use std::thread; use std::thread;
@ -139,9 +140,21 @@ fn run_monitor(config: &Config) -> Result<(), Box<dyn std::error::Error>> {
} }
} }
fn clean_body(body: &str) -> String {
// Replace multiple newlines with double newline (max)
let re_newlines = Regex::new(r"\n{3,}").unwrap();
let body = re_newlines.replace_all(body, "\n\n");
// Trim trailing spaces from each line
let re_trailing_spaces = Regex::new(r"(?m)[ \t]+$").unwrap();
let body = re_trailing_spaces.replace_all(&body, "");
body.trim().to_string()
}
fn extract_body(parsed: &mailparse::ParsedMail) -> Option<String> { fn extract_body(parsed: &mailparse::ParsedMail) -> Option<String> {
if parsed.ctype.mimetype == "text/plain" { if parsed.ctype.mimetype == "text/plain" {
return parsed.get_body().ok(); return parsed.get_body().ok().map(|s| clean_body(&s));
} }
// If multipart, search for text/plain // If multipart, search for text/plain
@ -153,8 +166,11 @@ fn extract_body(parsed: &mailparse::ParsedMail) -> Option<String> {
// Fallback to text/html if no plain text found (or first part if nothing else) // Fallback to text/html if no plain text found (or first part if nothing else)
if parsed.ctype.mimetype == "text/html" { if parsed.ctype.mimetype == "text/html" {
// Naive stripping of HTML tags could be done here, or just return as is. if let Ok(html_content) = parsed.get_body() {
return parsed.get_body().ok(); if let Ok(md) = html2text::from_read(html_content.as_bytes(), 80) {
return Some(clean_body(&md));
}
}
} }
None None