diff --git a/Cargo.lock b/Cargo.lock index 5f649541c..1b25d96ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2591,6 +2591,7 @@ dependencies = [ "lemmy_db_views_actor", "lemmy_db_views_moderator", "lemmy_utils", + "mime", "once_cell", "regex", "reqwest", diff --git a/crates/api/src/post/get_link_metadata.rs b/crates/api/src/post/get_link_metadata.rs index 7ab6e9a92..babceedb4 100644 --- a/crates/api/src/post/get_link_metadata.rs +++ b/crates/api/src/post/get_link_metadata.rs @@ -2,7 +2,7 @@ use actix_web::web::{Data, Json}; use lemmy_api_common::{ context::LemmyContext, post::{GetSiteMetadata, GetSiteMetadataResponse}, - request::fetch_site_metadata, + request::fetch_link_metadata, }; use lemmy_utils::error::LemmyError; @@ -11,7 +11,7 @@ pub async fn get_link_metadata( data: Json, context: Data, ) -> Result, LemmyError> { - let metadata = fetch_site_metadata(context.client(), &data.url).await?; + let metadata = fetch_link_metadata(&data.url, false, &context).await?; Ok(Json(GetSiteMetadataResponse { metadata })) } diff --git a/crates/api_common/Cargo.toml b/crates/api_common/Cargo.toml index 6a6264804..b1f4cca71 100644 --- a/crates/api_common/Cargo.toml +++ b/crates/api_common/Cargo.toml @@ -70,6 +70,7 @@ jsonwebtoken = { version = "8.3.0", optional = true } # necessary for wasmt compilation getrandom = { version = "0.2.10", features = ["js"] } task-local-extensions = "0.1.4" +mime = "0.3.17" [dev-dependencies] serial_test = { workspace = true } diff --git a/crates/api_common/src/post.rs b/crates/api_common/src/post.rs index c7ee08983..62a4063eb 100644 --- a/crates/api_common/src/post.rs +++ b/crates/api_common/src/post.rs @@ -238,17 +238,20 @@ pub struct GetSiteMetadata { #[cfg_attr(feature = "full", ts(export))] /// The site metadata response. pub struct GetSiteMetadataResponse { - pub metadata: SiteMetadata, + pub metadata: LinkMetadata, } #[skip_serializing_none] -#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone)] +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone, Default)] #[cfg_attr(feature = "full", derive(TS))] #[cfg_attr(feature = "full", ts(export))] /// Site metadata, from its opengraph tags. -pub struct SiteMetadata { +pub struct LinkMetadata { pub title: Option, pub description: Option, pub(crate) image: Option, pub embed_video_url: Option, + pub content_type: Option, + #[serde(skip)] + pub thumbnail: Option, } diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index 436e34c78..b22f3ccfc 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -1,65 +1,20 @@ -use crate::{context::LemmyContext, post::SiteMetadata}; +use crate::{context::LemmyContext, post::LinkMetadata}; use encoding::{all::encodings, DecoderTrap}; -use lemmy_db_schema::newtypes::DbUrl; use lemmy_utils::{ error::{LemmyError, LemmyErrorType}, settings::structs::Settings, version::VERSION, REQWEST_TIMEOUT, }; -use reqwest::{Client, ClientBuilder}; +use mime::Mime; +use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder}; use reqwest_middleware::ClientWithMiddleware; -use serde::{de::DeserializeOwned, Deserialize, Deserializer}; +use serde::Deserialize; use tracing::info; use url::Url; use urlencoding::encode; use webpage::HTML; -/// Both are options, since the URL might be either an html page, or an image -/// Returns the SiteMetadata, and an image URL, if there is a picture associated -#[tracing::instrument(skip_all)] -pub async fn fetch_site_data( - url: Option<&Url>, - include_image: bool, - context: &LemmyContext, -) -> (Option, Option) { - match &url { - Some(url) => { - // Fetch metadata - // Ignore errors, since it may be an image, or not have the data. - // Warning, this may ignore SSL errors - let metadata_option = fetch_site_metadata(context.client(), url).await.ok(); - if !include_image { - (metadata_option, None) - } else { - let thumbnail_url = fetch_pictrs_url_from_site_metadata(&metadata_option, url, &context) - .await - .ok(); - (metadata_option, thumbnail_url) - } - } - None => (None, None), - } -} - -/// Fetches the post link html tags (like title, description, image, etc) -#[tracing::instrument(skip_all)] -pub async fn fetch_site_metadata( - client: &ClientWithMiddleware, - url: &Url, -) -> Result { - info!("Fetching site metadata for url: {}", url); - let response = client.get(url.as_str()).send().await?; - - // Can't use .text() here, because it only checks the content header, not the actual bytes - // https://github.com/LemmyNet/lemmy/issues/1964 - let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); - - let tags = html_to_site_metadata(&html_bytes, url)?; - - Ok(tags) -} - pub fn client_builder(settings: &Settings) -> ClientBuilder { let user_agent = format!( "Lemmy/{}; +{}", @@ -73,7 +28,47 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder { .connect_timeout(REQWEST_TIMEOUT) } -fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result { +/// Fetches metadata for the given link and optionally generates thumbnail. +#[tracing::instrument(skip_all)] +pub async fn fetch_link_metadata( + url: &Url, + generate_thumbnail: bool, + context: &LemmyContext, +) -> Result { + info!("Fetching site metadata for url: {}", url); + let response = context.client().get(url.as_str()).send().await?; + + let content_type: Option = response + .headers() + .get(CONTENT_TYPE) + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.parse().ok()); + let is_image = content_type.as_ref().unwrap_or(&mime::TEXT_PLAIN).type_() == mime::IMAGE; + + // Can't use .text() here, because it only checks the content header, not the actual bytes + // https://github.com/LemmyNet/lemmy/issues/1964 + let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); + + let mut metadata = extract_opengraph_data(&html_bytes, url)?; + + metadata.content_type = content_type.map(|c| c.to_string()); + if generate_thumbnail && is_image { + let image_url = metadata + .image + .as_ref() + .map(lemmy_db_schema::newtypes::DbUrl::inner) + .unwrap_or(url); + metadata.thumbnail = generate_pictrs_thumbnail(image_url, context) + .await + .ok() + .map(Into::into); + } + + Ok(metadata) +} + +/// Extract site metadata from HTML Opengraph attributes. +fn extract_opengraph_data(html_bytes: &[u8], url: &Url) -> Result { let html = String::from_utf8_lossy(html_bytes); // Make sure the first line is doctype html @@ -129,11 +124,13 @@ fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result, - url: &Url, - context: &LemmyContext, -) -> Result { - let pictrs_res = match metadata_option { - Some(metadata_res) => match &metadata_res.image { - // Metadata, with image - // Try to generate a small thumbnail if there's a full sized one from post-links - Some(metadata_image) => fetch_pictrs(metadata_image, &context).await, - // Metadata, but no image - None => fetch_pictrs(url, &context).await, - }, - // No metadata, try to fetch the URL as an image - None => fetch_pictrs(url, &context).await, - }?; - - Url::parse(&format!( - "{}/pictrs/image/{}", - context.settings().get_protocol_and_hostname(), - pictrs_res.files.first().expect("missing pictrs file").file - )) - .map(Into::into) - .map_err(Into::into) -} - +/// Retrieves the image with local pict-rs and generates a thumbnail. Returns the thumbnail url. #[tracing::instrument(skip_all)] -async fn fetch_pictrs( +async fn generate_pictrs_thumbnail( image_url: &Url, context: &LemmyContext, -) -> Result { +) -> Result { let pictrs_config = context.settings().pictrs_config()?; - is_image_content_type(context.client(), image_url).await?; if pictrs_config.cache_remote_thumbnails { // fetch remote non-pictrs images for persistent thumbnail link + // TODO: should limit size once supported by pictrs let fetch_url = format!( "{}image/download?url={}", pictrs_config.url, @@ -263,18 +235,25 @@ async fn fetch_pictrs( .send() .await?; - let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?; + let response: PictrsResponse = response.json().await?; if response.msg == "ok" { - Ok(response) + let thumbnail_url = Url::parse(&format!( + "{}/pictrs/image/{}", + context.settings().get_protocol_and_hostname(), + response.files.first().expect("missing pictrs file").file + ))?; + Ok(thumbnail_url) } else { Err(LemmyErrorType::PictrsResponseError(response.msg))? } } else { - Err(LemmyErrorType::PictrsCachingDisabled)? + // return the original image as "thumbnail" + Ok(image_url.clone()) } } +// TODO: get rid of this #[tracing::instrument(skip_all)] async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> { let response = client.get(url.as_str()).send().await?; @@ -296,32 +275,39 @@ mod tests { #![allow(clippy::unwrap_used)] #![allow(clippy::indexing_slicing)] - use crate::request::{client_builder, fetch_site_metadata, html_to_site_metadata, SiteMetadata}; - use lemmy_utils::settings::SETTINGS; + use crate::{ + context::LemmyContext, + request::{extract_opengraph_data, fetch_link_metadata}, + }; use url::Url; // These helped with testing #[tokio::test] - async fn test_site_metadata() { - let settings = &SETTINGS.clone(); - let client = client_builder(settings).build().unwrap().into(); + async fn test_link_metadata() { + let context = LemmyContext::init_test_context().await; let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap(); - let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap(); + let sample_res = fetch_link_metadata(&sample_url, false, &context) + .await + .unwrap(); assert_eq!( - SiteMetadata { - title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()), - description: Some( - "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string() - ), - image: Some( - Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png") - .unwrap() - .into() - ), - embed_video_url: None, - }, - sample_res + Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()), + sample_res.title ); + assert_eq!( + Some("The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()), + sample_res.description + ); + assert_eq!( + Some( + Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png") + .unwrap() + .into() + ), + sample_res.image + ); + assert_eq!(None, sample_res.embed_video_url); + assert_eq!(None, sample_res.content_type); + assert_eq!(None, sample_res.thumbnail); } // #[test] @@ -339,7 +325,7 @@ mod tests { // root relative url let html_bytes = b""; - let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://example.com/image.jpg").unwrap().into()) @@ -347,7 +333,7 @@ mod tests { // base relative url let html_bytes = b""; - let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some( @@ -359,7 +345,7 @@ mod tests { // absolute url let html_bytes = b""; - let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into()) @@ -367,7 +353,7 @@ mod tests { // protocol relative url let html_bytes = b""; - let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://example.com/image.jpg").unwrap().into()) diff --git a/crates/api_crud/src/post/create.rs b/crates/api_crud/src/post/create.rs index 3537738ef..211e3b55d 100644 --- a/crates/api_crud/src/post/create.rs +++ b/crates/api_crud/src/post/create.rs @@ -4,7 +4,7 @@ use lemmy_api_common::{ build_response::build_post_response, context::LemmyContext, post::{CreatePost, PostResponse}, - request::fetch_site_data, + request::fetch_link_metadata, send_activity::{ActivityChannel, SendActivityData}, utils::{ check_community_user_action, @@ -83,10 +83,12 @@ pub async fn create_post( } // Fetch post links and pictrs cached image - let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await; - let (embed_title, embed_description, embed_video_url) = metadata_res - .map(|u| (u.title, u.description, u.embed_video_url)) - .unwrap_or_default(); + let metadata = match data_url { + Some(url) => fetch_link_metadata(url, true, &context) + .await + .unwrap_or_default(), + _ => Default::default(), + }; // Only need to check if language is allowed in case user set it explicitly. When using default // language, it already only returns allowed languages. @@ -117,11 +119,11 @@ pub async fn create_post( .community_id(data.community_id) .creator_id(local_user_view.person.id) .nsfw(data.nsfw) - .embed_title(embed_title) - .embed_description(embed_description) - .embed_video_url(embed_video_url) + .embed_title(metadata.title) + .embed_description(metadata.description) + .embed_video_url(metadata.embed_video_url) .language_id(language_id) - .thumbnail_url(thumbnail_url) + .thumbnail_url(metadata.thumbnail) .build(); let inserted_post = Post::create(&mut context.pool(), &post_form) diff --git a/crates/api_crud/src/post/update.rs b/crates/api_crud/src/post/update.rs index 1c47cb3f4..b0d1c6c43 100644 --- a/crates/api_crud/src/post/update.rs +++ b/crates/api_crud/src/post/update.rs @@ -4,7 +4,7 @@ use lemmy_api_common::{ build_response::build_post_response, context::LemmyContext, post::{EditPost, PostResponse}, - request::fetch_site_data, + request::fetch_link_metadata, send_activity::{ActivityChannel, SendActivityData}, utils::{check_community_user_action, local_site_to_slur_regex, process_markdown_opt}, }; @@ -68,11 +68,12 @@ pub async fn update_post( } // Fetch post links and Pictrs cached image - let data_url = data.url.as_ref(); - let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await; - let (embed_title, embed_description, embed_video_url) = metadata_res - .map(|u| (Some(u.title), Some(u.description), Some(u.embed_video_url))) - .unwrap_or_default(); + let metadata = match data_url { + Some(url) => fetch_link_metadata(url, true, &context) + .await + .unwrap_or_default(), + _ => Default::default(), + }; let language_id = data.language_id; CommunityLanguage::is_allowed_community_language( @@ -87,11 +88,11 @@ pub async fn update_post( url, body: diesel_option_overwrite(body), nsfw: data.nsfw, - embed_title, - embed_description, - embed_video_url, + embed_title: Some(metadata.title), + embed_description: Some(metadata.description), + embed_video_url: Some(metadata.embed_video_url), language_id: data.language_id, - thumbnail_url: Some(thumbnail_url), + thumbnail_url: Some(metadata.thumbnail), updated: Some(Some(naive_now())), ..Default::default() }; diff --git a/crates/apub/src/objects/post.rs b/crates/apub/src/objects/post.rs index 674674681..223103687 100644 --- a/crates/apub/src/objects/post.rs +++ b/crates/apub/src/objects/post.rs @@ -24,7 +24,7 @@ use chrono::{DateTime, Utc}; use html2text::{from_read_with_decorator, render::text_renderer::TrivialDecorator}; use lemmy_api_common::{ context::LemmyContext, - request::fetch_site_data, + request::fetch_link_metadata, utils::{ is_mod_or_admin, local_site_opt_to_sensitive, @@ -216,16 +216,12 @@ impl Object for ApubPost { // Only fetch metadata if the post has a url and was not seen previously. We dont want to // waste resources by fetching metadata for the same post multiple times. // Additionally, only fetch image if content is not sensitive or is allowed on local site. - let (metadata_res, thumbnail) = match &url { - Some(url) if old_post.is_err() => fetch_site_data(Some(url), include_image, &context).await, - _ => (None, None), + let metadata = match &url { + Some(url) => fetch_link_metadata(url, include_image, context) + .await + .unwrap_or_default(), + _ => Default::default(), }; - // If no image was included with metadata, use post image instead when available. - let thumbnail_url = thumbnail.or_else(|| page.image.map(|i| i.url.into())); - - let (embed_title, embed_description, embed_video_url) = metadata_res - .map(|u| (u.title, u.description, u.embed_video_url)) - .unwrap_or_default(); let slur_regex = &local_site_opt_to_slur_regex(&local_site); let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source); @@ -245,10 +241,10 @@ impl Object for ApubPost { updated: page.updated.map(Into::into), deleted: Some(false), nsfw: page.sensitive, - embed_title, - embed_description, - embed_video_url, - thumbnail_url, + embed_title: metadata.title, + embed_description: metadata.description, + embed_video_url: metadata.embed_video_url, + thumbnail_url: metadata.thumbnail, ap_id: Some(page.id.clone().into()), local: Some(false), language_id,