more cleanup (fixes #2611)

2023-10-27 16:11:12 +02:00 · 2023-10-27 16:11:12 +02:00 · c8c355dbe2
parent bf205399e5
commit c8c355dbe2
8 changed files with 134 additions and 144 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2591,6 +2591,7 @@ dependencies = [
 "lemmy_db_views_actor",
 "lemmy_db_views_moderator",
 "lemmy_utils",
+ "mime",
 "once_cell",
 "regex",
 "reqwest",
--- a/crates/api/src/post/get_link_metadata.rs
+++ b/crates/api/src/post/get_link_metadata.rs
@ -2,7 +2,7 @@ use actix_web::web::{Data, Json};
 use lemmy_api_common::{
  context::LemmyContext,
  post::{GetSiteMetadata, GetSiteMetadataResponse},
-  request::fetch_site_metadata,
+  request::fetch_link_metadata,
 };
 use lemmy_utils::error::LemmyError;

@ -11,7 +11,7 @@ pub async fn get_link_metadata(
  data: Json<GetSiteMetadata>,
  context: Data<LemmyContext>,
 ) -> Result<Json<GetSiteMetadataResponse>, LemmyError> {
-  let metadata = fetch_site_metadata(context.client(), &data.url).await?;
+  let metadata = fetch_link_metadata(&data.url, false, &context).await?;

  Ok(Json(GetSiteMetadataResponse { metadata }))
 }
--- a/crates/api_common/Cargo.toml
+++ b/crates/api_common/Cargo.toml
@ -70,6 +70,7 @@ jsonwebtoken = { version = "8.3.0", optional = true }
 # necessary for wasm compilation
 getrandom = { version = "0.2.10", features = ["js"] }
 task-local-extensions = "0.1.4"
+mime = "0.3.17"

 [dev-dependencies]
 serial_test = { workspace = true }
--- a/crates/api_common/src/post.rs
+++ b/crates/api_common/src/post.rs
@ -238,17 +238,20 @@ pub struct GetSiteMetadata {
 #[cfg_attr(feature = "full", ts(export))]
 /// The site metadata response.
 pub struct GetSiteMetadataResponse {
-  pub metadata: SiteMetadata,
+  pub metadata: LinkMetadata,
 }

 #[skip_serializing_none]
-#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone)]
+#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone, Default)]
 #[cfg_attr(feature = "full", derive(TS))]
 #[cfg_attr(feature = "full", ts(export))]
 /// Site metadata, from its opengraph tags.
-pub struct SiteMetadata {
+pub struct LinkMetadata {
  pub title: Option<String>,
  pub description: Option<String>,
  pub(crate) image: Option<DbUrl>,
  pub embed_video_url: Option<DbUrl>,
+  pub content_type: Option<String>,
+  #[serde(skip)]
+  pub thumbnail: Option<DbUrl>,
 }
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@ -1,65 +1,20 @@
-use crate::{context::LemmyContext, post::SiteMetadata};
+use crate::{context::LemmyContext, post::LinkMetadata};
 use encoding::{all::encodings, DecoderTrap};
-use lemmy_db_schema::newtypes::DbUrl;
 use lemmy_utils::{
  error::{LemmyError, LemmyErrorType},
  settings::structs::Settings,
  version::VERSION,
  REQWEST_TIMEOUT,
 };
-use reqwest::{Client, ClientBuilder};
+use mime::Mime;
+use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder};
 use reqwest_middleware::ClientWithMiddleware;
-use serde::{de::DeserializeOwned, Deserialize, Deserializer};
+use serde::Deserialize;
 use tracing::info;
 use url::Url;
 use urlencoding::encode;
 use webpage::HTML;

-/// Both are options, since the URL might be either an html page, or an image
-/// Returns the SiteMetadata, and an image URL, if there is a picture associated
-#[tracing::instrument(skip_all)]
-pub async fn fetch_site_data(
-  url: Option<&Url>,
-  include_image: bool,
-  context: &LemmyContext,
-) -> (Option<SiteMetadata>, Option<DbUrl>) {
-  match &url {
-    Some(url) => {
-      // Fetch metadata
-      // Ignore errors, since it may be an image, or not have the data.
-      // Warning, this may ignore SSL errors
-      let metadata_option = fetch_site_metadata(context.client(), url).await.ok();
-      if !include_image {
-        (metadata_option, None)
-      } else {
-        let thumbnail_url = fetch_pictrs_url_from_site_metadata(&metadata_option, url, &context)
-          .await
-          .ok();
-        (metadata_option, thumbnail_url)
-      }
-    }
-    None => (None, None),
-  }
-}
-
-/// Fetches the post link html tags (like title, description, image, etc)
-#[tracing::instrument(skip_all)]
-pub async fn fetch_site_metadata(
-  client: &ClientWithMiddleware,
-  url: &Url,
-) -> Result<SiteMetadata, LemmyError> {
-  info!("Fetching site metadata for url: {}", url);
-  let response = client.get(url.as_str()).send().await?;
-
-  // Can't use .text() here, because it only checks the content header, not the actual bytes
-  // https://github.com/LemmyNet/lemmy/issues/1964
-  let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
-
-  let tags = html_to_site_metadata(&html_bytes, url)?;
-
-  Ok(tags)
-}
-
 pub fn client_builder(settings: &Settings) -> ClientBuilder {
  let user_agent = format!(
    "Lemmy/{}; +{}",
@ -73,7 +28,47 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder {
    .connect_timeout(REQWEST_TIMEOUT)
 }

-fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
+/// Fetches metadata for the given link and optionally generates a thumbnail.
+///
+/// The link is requested once. Its `Content-Type` header is stored in `content_type`, and
+/// unless that header marks the response as an image, the body is parsed for Opengraph tags.
+///
+/// If `generate_thumbnail` is true, a thumbnail is generated from the Opengraph image when
+/// one was found, otherwise from the link itself when it was served as an image. A failed
+/// thumbnail generation is not an error; `thumbnail` is simply left as `None`.
+///
+/// Returns an error if the request fails, or if reading or parsing the body of a non-image
+/// response fails.
+#[tracing::instrument(skip_all)]
+pub async fn fetch_link_metadata(
+  url: &Url,
+  generate_thumbnail: bool,
+  context: &LemmyContext,
+) -> Result<LinkMetadata, LemmyError> {
+  info!("Fetching site metadata for url: {}", url);
+  let response = context.client().get(url.as_str()).send().await?;
+
+  let content_type: Option<Mime> = response
+    .headers()
+    .get(CONTENT_TYPE)
+    .and_then(|h| h.to_str().ok())
+    .and_then(|h| h.parse().ok());
+  // A missing or unparsable header is treated as "not an image".
+  let is_image = content_type.as_ref().unwrap_or(&mime::TEXT_PLAIN).type_() == mime::IMAGE;
+
+  let mut metadata = if is_image {
+    // An image body is not an HTML document, so there are no Opengraph tags to extract.
+    // Skipping the extraction also keeps its HTML checks from failing the whole call before
+    // the thumbnail step below is reached.
+    LinkMetadata::default()
+  } else {
+    // Can't use .text() here, because it only checks the content header, not the actual bytes
+    // https://github.com/LemmyNet/lemmy/issues/1964
+    let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
+    extract_opengraph_data(&html_bytes, url)?
+  };
+  metadata.content_type = content_type.map(|c| c.to_string());
+
+  if generate_thumbnail {
+    // Prefer the Opengraph image; fall back to the link itself only if it is an image.
+    let image_url = metadata
+      .image
+      .as_ref()
+      .map(lemmy_db_schema::newtypes::DbUrl::inner)
+      .or(is_image.then_some(url));
+    if let Some(image_url) = image_url {
+      // Thumbnail errors are deliberately dropped: the rest of the metadata is still useful.
+      metadata.thumbnail = generate_pictrs_thumbnail(image_url, context)
+        .await
+        .ok()
+        .map(Into::into);
+    }
+  }
+
+  Ok(metadata)
+}
+
+/// Extract site metadata from HTML Opengraph attributes.
+fn extract_opengraph_data(html_bytes: &[u8], url: &Url) -> Result<LinkMetadata, LemmyError> {
  let html = String::from_utf8_lossy(html_bytes);

  // Make sure the first line is doctype html
@ -129,11 +124,13 @@ fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, L
    // join also works if the target URL is absolute
    .and_then(|v| url.join(&v.url).ok());

-  Ok(SiteMetadata {
+  Ok(LinkMetadata {
    title: og_title.or(page_title),
    description: og_description.or(page_description),
    image: og_image.map(Into::into),
    embed_video_url: og_embed_url.map(Into::into),
+    content_type: None,
+    thumbnail: None,
  })
 }

@ -214,42 +211,17 @@ pub async fn delete_image_from_pictrs(
  Ok(())
 }

-async fn fetch_pictrs_url_from_site_metadata(
-  metadata_option: &Option<SiteMetadata>,
-  url: &Url,
-  context: &LemmyContext,
-) -> Result<DbUrl, LemmyError> {
-  let pictrs_res = match metadata_option {
-    Some(metadata_res) => match &metadata_res.image {
-      // Metadata, with image
-      // Try to generate a small thumbnail if there's a full sized one from post-links
-      Some(metadata_image) => fetch_pictrs(metadata_image, &context).await,
-      // Metadata, but no image
-      None => fetch_pictrs(url, &context).await,
-    },
-    // No metadata, try to fetch the URL as an image
-    None => fetch_pictrs(url, &context).await,
-  }?;
-
-  Url::parse(&format!(
-    "{}/pictrs/image/{}",
-    context.settings().get_protocol_and_hostname(),
-    pictrs_res.files.first().expect("missing pictrs file").file
-  ))
-  .map(Into::into)
-  .map_err(Into::into)
-}
-
+/// Retrieves the image with local pict-rs and generates a thumbnail. Returns the thumbnail url.
 #[tracing::instrument(skip_all)]
-async fn fetch_pictrs(
+async fn generate_pictrs_thumbnail(
  image_url: &Url,
  context: &LemmyContext,
-) -> Result<PictrsResponse, LemmyError> {
+) -> Result<Url, LemmyError> {
  let pictrs_config = context.settings().pictrs_config()?;
-  is_image_content_type(context.client(), image_url).await?;

  if pictrs_config.cache_remote_thumbnails {
    // fetch remote non-pictrs images for persistent thumbnail link
+    // TODO: should limit size once supported by pictrs
    let fetch_url = format!(
      "{}image/download?url={}",
      pictrs_config.url,
@ -263,18 +235,25 @@ async fn fetch_pictrs(
      .send()
      .await?;

-    let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
+    let response: PictrsResponse = response.json().await?;

    if response.msg == "ok" {
-      Ok(response)
+      let thumbnail_url = Url::parse(&format!(
+        "{}/pictrs/image/{}",
+        context.settings().get_protocol_and_hostname(),
+        response.files.first().expect("missing pictrs file").file
+      ))?;
+      Ok(thumbnail_url)
    } else {
      Err(LemmyErrorType::PictrsResponseError(response.msg))?
    }
  } else {
-    Err(LemmyErrorType::PictrsCachingDisabled)?
+    // return the original image as "thumbnail"
+    Ok(image_url.clone())
  }
 }

+// TODO: get rid of this
 #[tracing::instrument(skip_all)]
 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
  let response = client.get(url.as_str()).send().await?;
@ -296,32 +275,39 @@ mod tests {
  #![allow(clippy::unwrap_used)]
  #![allow(clippy::indexing_slicing)]

-  use crate::request::{client_builder, fetch_site_metadata, html_to_site_metadata, SiteMetadata};
-  use lemmy_utils::settings::SETTINGS;
+  use crate::{
+    context::LemmyContext,
+    request::{extract_opengraph_data, fetch_link_metadata},
+  };
  use url::Url;

  // These helped with testing
+  // Needs network access: requests a live page and checks the metadata parsed from it.
  #[tokio::test]
-  async fn test_site_metadata() {
-    let settings = &SETTINGS.clone();
-    let client = client_builder(settings).build().unwrap().into();
+  async fn test_link_metadata() {
+    let context = LemmyContext::init_test_context().await;
    let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
-    let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
+    let sample_res = fetch_link_metadata(&sample_url, false, &context)
+      .await
+      .unwrap();
    assert_eq!(
-      SiteMetadata {
-        title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
-        description: Some(
-          "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
-        ),
-        image: Some(
-          Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
-            .unwrap()
-            .into()
-        ),
-        embed_video_url: None,
-      },
-      sample_res
+      Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
+      sample_res.title
    );
+    assert_eq!(
+      Some("The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()),
+      sample_res.description
+    );
+    assert_eq!(
+      Some(
+        Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
+          .unwrap()
+          .into()
+      ),
+      sample_res.image
+    );
+    assert_eq!(None, sample_res.embed_video_url);
+    // `fetch_link_metadata` copies the response's Content-Type header into the result, so an
+    // HTML page must report an HTML content type here rather than `None`.
+    assert!(sample_res
+      .content_type
+      .as_deref()
+      .map_or(false, |c| c.starts_with("text/html")));
+    // No thumbnail was requested (`generate_thumbnail` is false).
+    assert_eq!(None, sample_res.thumbnail);
  }

  // #[test]
@ -339,7 +325,7 @@ mod tests {

    // root relative url
    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
-    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
    assert_eq!(
      metadata.image,
      Some(Url::parse("https://example.com/image.jpg").unwrap().into())
@ -347,7 +333,7 @@ mod tests {

    // base relative url
    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
-    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
    assert_eq!(
      metadata.image,
      Some(
@ -359,7 +345,7 @@ mod tests {

    // absolute url
    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
-    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
    assert_eq!(
      metadata.image,
      Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
@ -367,7 +353,7 @@ mod tests {

    // protocol relative url
    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
-    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
    assert_eq!(
      metadata.image,
      Some(Url::parse("https://example.com/image.jpg").unwrap().into())
--- a/crates/api_crud/src/post/create.rs
+++ b/crates/api_crud/src/post/create.rs
@ -4,7 +4,7 @@ use lemmy_api_common::{
  build_response::build_post_response,
  context::LemmyContext,
  post::{CreatePost, PostResponse},
-  request::fetch_site_data,
+  request::fetch_link_metadata,
  send_activity::{ActivityChannel, SendActivityData},
  utils::{
    check_community_user_action,
@ -83,10 +83,12 @@ pub async fn create_post(
  }

  // Fetch post links and pictrs cached image
-  let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await;
-  let (embed_title, embed_description, embed_video_url) = metadata_res
-    .map(|u| (u.title, u.description, u.embed_video_url))
-    .unwrap_or_default();
+  let metadata = match data_url {
+    Some(url) => fetch_link_metadata(url, true, &context)
+      .await
+      .unwrap_or_default(),
+    _ => Default::default(),
+  };

  // Only need to check if language is allowed in case user set it explicitly. When using default
  // language, it already only returns allowed languages.
@ -117,11 +119,11 @@ pub async fn create_post(
    .community_id(data.community_id)
    .creator_id(local_user_view.person.id)
    .nsfw(data.nsfw)
-    .embed_title(embed_title)
-    .embed_description(embed_description)
-    .embed_video_url(embed_video_url)
+    .embed_title(metadata.title)
+    .embed_description(metadata.description)
+    .embed_video_url(metadata.embed_video_url)
    .language_id(language_id)
-    .thumbnail_url(thumbnail_url)
+    .thumbnail_url(metadata.thumbnail)
    .build();

  let inserted_post = Post::create(&mut context.pool(), &post_form)
--- a/crates/api_crud/src/post/update.rs
+++ b/crates/api_crud/src/post/update.rs
@ -4,7 +4,7 @@ use lemmy_api_common::{
  build_response::build_post_response,
  context::LemmyContext,
  post::{EditPost, PostResponse},
-  request::fetch_site_data,
+  request::fetch_link_metadata,
  send_activity::{ActivityChannel, SendActivityData},
  utils::{check_community_user_action, local_site_to_slur_regex, process_markdown_opt},
 };
@ -68,11 +68,12 @@ pub async fn update_post(
  }

  // Fetch post links and Pictrs cached image
-  let data_url = data.url.as_ref();
-  let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await;
-  let (embed_title, embed_description, embed_video_url) = metadata_res
-    .map(|u| (Some(u.title), Some(u.description), Some(u.embed_video_url)))
-    .unwrap_or_default();
+  let metadata = match data_url {
+    Some(url) => fetch_link_metadata(url, true, &context)
+      .await
+      .unwrap_or_default(),
+    _ => Default::default(),
+  };

  let language_id = data.language_id;
  CommunityLanguage::is_allowed_community_language(
@ -87,11 +88,11 @@ pub async fn update_post(
    url,
    body: diesel_option_overwrite(body),
    nsfw: data.nsfw,
-    embed_title,
-    embed_description,
-    embed_video_url,
+    embed_title: Some(metadata.title),
+    embed_description: Some(metadata.description),
+    embed_video_url: Some(metadata.embed_video_url),
    language_id: data.language_id,
-    thumbnail_url: Some(thumbnail_url),
+    thumbnail_url: Some(metadata.thumbnail),
    updated: Some(Some(naive_now())),
    ..Default::default()
  };
--- a/crates/apub/src/objects/post.rs
+++ b/crates/apub/src/objects/post.rs
@ -24,7 +24,7 @@ use chrono::{DateTime, Utc};
 use html2text::{from_read_with_decorator, render::text_renderer::TrivialDecorator};
 use lemmy_api_common::{
  context::LemmyContext,
-  request::fetch_site_data,
+  request::fetch_link_metadata,
  utils::{
    is_mod_or_admin,
    local_site_opt_to_sensitive,
@ -216,16 +216,12 @@ impl Object for ApubPost {
      // Only fetch metadata if the post has a url and was not seen previously. We don't want to
      // waste resources by fetching metadata for the same post multiple times.
      // Additionally, only fetch image if content is not sensitive or is allowed on local site.
-      let (metadata_res, thumbnail) = match &url {
-        Some(url) if old_post.is_err() => fetch_site_data(Some(url), include_image, &context).await,
-        _ => (None, None),
+      let metadata = match &url {
+        Some(url) => fetch_link_metadata(url, include_image, context)
+          .await
+          .unwrap_or_default(),
+        _ => Default::default(),
      };
-      // If no image was included with metadata, use post image instead when available.
-      let thumbnail_url = thumbnail.or_else(|| page.image.map(|i| i.url.into()));
-
-      let (embed_title, embed_description, embed_video_url) = metadata_res
-        .map(|u| (u.title, u.description, u.embed_video_url))
-        .unwrap_or_default();
      let slur_regex = &local_site_opt_to_slur_regex(&local_site);

      let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source);
@ -245,10 +241,10 @@ impl Object for ApubPost {
        updated: page.updated.map(Into::into),
        deleted: Some(false),
        nsfw: page.sensitive,
-        embed_title,
-        embed_description,
-        embed_video_url,
-        thumbnail_url,
+        embed_title: metadata.title,
+        embed_description: metadata.description,
+        embed_video_url: metadata.embed_video_url,
+        thumbnail_url: metadata.thumbnail,
        ap_id: Some(page.id.clone().into()),
        local: Some(false),
        language_id,