From c514f5615837bf1aaf08b03233c175056bae5dcd Mon Sep 17 00:00:00 2001 From: Felix Ableitner Date: Wed, 20 Oct 2021 17:48:10 +0200 Subject: [PATCH] Added html2md crate to parse comment html from pleroma (fixes #1461) --- Cargo.lock | 71 ++++++++++++++++++++++++++++++ crates/apub/Cargo.toml | 1 + crates/apub/src/objects/comment.rs | 17 ++++--- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3af6e1bd9..ac7e21db7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,6 +614,12 @@ version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79c2681d6594606957bbb8631c4b90a7fcaaa72cdb714743a437b156d6a7eedd" +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.0" @@ -667,6 +673,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" +[[package]] +name = "combine" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a909e4d93292cd8e9c42e189f61681eff9d67b6541f96b8a1a737f23737bd001" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "comrak" version = "0.12.1" @@ -1371,6 +1387,20 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a164bb2ceaeff4f42542bdb847c41517c78a60f5649671b2a07312b6e117549" +[[package]] +name = "html2md" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61f6bf799d9770725ec13d66f4af9344e96285dc14d8e71e0fe02d272690667f" +dependencies = [ + "html5ever 0.25.1", + "jni", + "lazy_static", + "markup5ever_rcdom", + "percent-encoding", + "regex", +] + [[package]] name = "html5ever" version = "0.22.5" @@ -1601,6 +1631,26 @@ version = "0.4.8" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +[[package]] +name = "jni" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec" +dependencies = [ + "cesu8", + "combine", + "jni-sys", + "log", + "thiserror", + "walkdir", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + [[package]] name = "jpeg-decoder" version = "0.1.22" @@ -1768,6 +1818,7 @@ dependencies = [ "chrono", "diesel", "futures", + "html2md", "http", "http-signature-normalization-actix", "itertools", @@ -3104,6 +3155,15 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.19" @@ -4033,6 +4093,17 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + [[package]] name = "want" version = "0.3.0" diff --git a/crates/apub/Cargo.toml b/crates/apub/Cargo.toml index 45ee1c312..10033c4c0 100644 --- a/crates/apub/Cargo.toml +++ b/crates/apub/Cargo.toml @@ -49,6 +49,7 @@ anyhow = "1.0.44" thiserror = "1.0.29" 
background-jobs = "0.9.0" reqwest = { version = "0.11.4", features = ["json"] } +html2md = "0.2.13" [dev-dependencies] serial_test = "0.5.1" diff --git a/crates/apub/src/objects/comment.rs b/crates/apub/src/objects/comment.rs index cf125ce9f..72d477d10 100644 --- a/crates/apub/src/objects/comment.rs +++ b/crates/apub/src/objects/comment.rs @@ -16,6 +16,7 @@ use activitystreams::{ }; use anyhow::{anyhow, Context}; use chrono::{DateTime, FixedOffset}; +use html2md::parse_html; use lemmy_api_common::blocking; use lemmy_apub_lib::{ traits::{ApubObject, FromApub, ToApub}, @@ -284,12 +285,11 @@ impl FromApub for ApubComment { } let content = if let SourceCompat::Lemmy(source) = &note.source { - &source.content + source.content.clone() } else { - // TODO: convert from html to markdown - &note.content + parse_html(&note.content) }; - let content_slurs_removed = remove_slurs(content, &context.settings().slur_regex()); + let content_slurs_removed = remove_slurs(&content, &context.settings().slur_regex()); let form = CommentForm { creator_id: creator.id, @@ -373,8 +373,15 @@ mod tests { .unwrap(); assert_eq!(comment.ap_id.clone().into_inner(), pleroma_url); - assert_eq!(comment.content.len(), 179); + assert_eq!(comment.content.len(), 64); assert!(!comment.local); assert_eq!(request_counter, 0); } + + #[actix_rt::test] + #[serial] + async fn test_html_to_markdown_sanitize() { + let parsed = parse_html(&"hello"); + assert_eq!(parsed, "**hello**"); + } }