more cleanup (fixes #2611)

cleanup-request-rs
Felix Ableitner 2023-10-27 16:11:12 +02:00
parent bf205399e5
commit c8c355dbe2
8 changed files with 134 additions and 144 deletions

1
Cargo.lock generated
View File

@ -2591,6 +2591,7 @@ dependencies = [
"lemmy_db_views_actor",
"lemmy_db_views_moderator",
"lemmy_utils",
"mime",
"once_cell",
"regex",
"reqwest",

View File

@ -2,7 +2,7 @@ use actix_web::web::{Data, Json};
use lemmy_api_common::{
context::LemmyContext,
post::{GetSiteMetadata, GetSiteMetadataResponse},
request::fetch_site_metadata,
request::fetch_link_metadata,
};
use lemmy_utils::error::LemmyError;
@ -11,7 +11,7 @@ pub async fn get_link_metadata(
data: Json<GetSiteMetadata>,
context: Data<LemmyContext>,
) -> Result<Json<GetSiteMetadataResponse>, LemmyError> {
let metadata = fetch_site_metadata(context.client(), &data.url).await?;
let metadata = fetch_link_metadata(&data.url, false, &context).await?;
Ok(Json(GetSiteMetadataResponse { metadata }))
}

View File

@ -70,6 +70,7 @@ jsonwebtoken = { version = "8.3.0", optional = true }
# necessary for wasm compilation
getrandom = { version = "0.2.10", features = ["js"] }
task-local-extensions = "0.1.4"
mime = "0.3.17"
[dev-dependencies]
serial_test = { workspace = true }

View File

@ -238,17 +238,20 @@ pub struct GetSiteMetadata {
#[cfg_attr(feature = "full", ts(export))]
/// The site metadata response.
pub struct GetSiteMetadataResponse {
pub metadata: SiteMetadata,
pub metadata: LinkMetadata,
}
#[skip_serializing_none]
#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone)]
#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "full", derive(TS))]
#[cfg_attr(feature = "full", ts(export))]
/// Site metadata, from its opengraph tags.
pub struct SiteMetadata {
pub struct LinkMetadata {
pub title: Option<String>,
pub description: Option<String>,
pub(crate) image: Option<DbUrl>,
pub embed_video_url: Option<DbUrl>,
pub content_type: Option<String>,
#[serde(skip)]
pub thumbnail: Option<DbUrl>,
}

View File

@ -1,65 +1,20 @@
use crate::{context::LemmyContext, post::SiteMetadata};
use crate::{context::LemmyContext, post::LinkMetadata};
use encoding::{all::encodings, DecoderTrap};
use lemmy_db_schema::newtypes::DbUrl;
use lemmy_utils::{
error::{LemmyError, LemmyErrorType},
settings::structs::Settings,
version::VERSION,
REQWEST_TIMEOUT,
};
use reqwest::{Client, ClientBuilder};
use mime::Mime;
use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder};
use reqwest_middleware::ClientWithMiddleware;
use serde::{de::DeserializeOwned, Deserialize, Deserializer};
use serde::Deserialize;
use tracing::info;
use url::Url;
use urlencoding::encode;
use webpage::HTML;
/// Both are options, since the URL might be either an html page, or an image
/// Returns the SiteMetadata, and an image URL, if there is a picture associated
#[tracing::instrument(skip_all)]
pub async fn fetch_site_data(
url: Option<&Url>,
include_image: bool,
context: &LemmyContext,
) -> (Option<SiteMetadata>, Option<DbUrl>) {
match &url {
Some(url) => {
// Fetch metadata
// Ignore errors, since it may be an image, or not have the data.
// Warning, this may ignore SSL errors
let metadata_option = fetch_site_metadata(context.client(), url).await.ok();
if !include_image {
(metadata_option, None)
} else {
let thumbnail_url = fetch_pictrs_url_from_site_metadata(&metadata_option, url, &context)
.await
.ok();
(metadata_option, thumbnail_url)
}
}
None => (None, None),
}
}
/// Fetches the post link html tags (like title, description, image, etc)
#[tracing::instrument(skip_all)]
pub async fn fetch_site_metadata(
client: &ClientWithMiddleware,
url: &Url,
) -> Result<SiteMetadata, LemmyError> {
info!("Fetching site metadata for url: {}", url);
let response = client.get(url.as_str()).send().await?;
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
let tags = html_to_site_metadata(&html_bytes, url)?;
Ok(tags)
}
pub fn client_builder(settings: &Settings) -> ClientBuilder {
let user_agent = format!(
"Lemmy/{}; +{}",
@ -73,7 +28,47 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder {
.connect_timeout(REQWEST_TIMEOUT)
}
fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
/// Fetches metadata for the given link and optionally generates thumbnail.
#[tracing::instrument(skip_all)]
pub async fn fetch_link_metadata(
url: &Url,
generate_thumbnail: bool,
context: &LemmyContext,
) -> Result<LinkMetadata, LemmyError> {
info!("Fetching site metadata for url: {}", url);
let response = context.client().get(url.as_str()).send().await?;
let content_type: Option<Mime> = response
.headers()
.get(CONTENT_TYPE)
.and_then(|h| h.to_str().ok())
.and_then(|h| h.parse().ok());
let is_image = content_type.as_ref().unwrap_or(&mime::TEXT_PLAIN).type_() == mime::IMAGE;
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
let mut metadata = extract_opengraph_data(&html_bytes, url)?;
metadata.content_type = content_type.map(|c| c.to_string());
if generate_thumbnail && is_image {
let image_url = metadata
.image
.as_ref()
.map(lemmy_db_schema::newtypes::DbUrl::inner)
.unwrap_or(url);
metadata.thumbnail = generate_pictrs_thumbnail(image_url, context)
.await
.ok()
.map(Into::into);
}
Ok(metadata)
}
/// Extract site metadata from HTML Opengraph attributes.
fn extract_opengraph_data(html_bytes: &[u8], url: &Url) -> Result<LinkMetadata, LemmyError> {
let html = String::from_utf8_lossy(html_bytes);
// Make sure the first line is doctype html
@ -129,11 +124,13 @@ fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, L
// join also works if the target URL is absolute
.and_then(|v| url.join(&v.url).ok());
Ok(SiteMetadata {
Ok(LinkMetadata {
title: og_title.or(page_title),
description: og_description.or(page_description),
image: og_image.map(Into::into),
embed_video_url: og_embed_url.map(Into::into),
content_type: None,
thumbnail: None,
})
}
@ -214,42 +211,17 @@ pub async fn delete_image_from_pictrs(
Ok(())
}
async fn fetch_pictrs_url_from_site_metadata(
metadata_option: &Option<SiteMetadata>,
url: &Url,
context: &LemmyContext,
) -> Result<DbUrl, LemmyError> {
let pictrs_res = match metadata_option {
Some(metadata_res) => match &metadata_res.image {
// Metadata, with image
// Try to generate a small thumbnail if there's a full sized one from post-links
Some(metadata_image) => fetch_pictrs(metadata_image, &context).await,
// Metadata, but no image
None => fetch_pictrs(url, &context).await,
},
// No metadata, try to fetch the URL as an image
None => fetch_pictrs(url, &context).await,
}?;
Url::parse(&format!(
"{}/pictrs/image/{}",
context.settings().get_protocol_and_hostname(),
pictrs_res.files.first().expect("missing pictrs file").file
))
.map(Into::into)
.map_err(Into::into)
}
/// Retrieves the image with local pict-rs and generates a thumbnail. Returns the thumbnail url.
#[tracing::instrument(skip_all)]
async fn fetch_pictrs(
async fn generate_pictrs_thumbnail(
image_url: &Url,
context: &LemmyContext,
) -> Result<PictrsResponse, LemmyError> {
) -> Result<Url, LemmyError> {
let pictrs_config = context.settings().pictrs_config()?;
is_image_content_type(context.client(), image_url).await?;
if pictrs_config.cache_remote_thumbnails {
// fetch remote non-pictrs images for persistent thumbnail link
// TODO: should limit size once supported by pictrs
let fetch_url = format!(
"{}image/download?url={}",
pictrs_config.url,
@ -263,18 +235,25 @@ async fn fetch_pictrs(
.send()
.await?;
let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
let response: PictrsResponse = response.json().await?;
if response.msg == "ok" {
Ok(response)
let thumbnail_url = Url::parse(&format!(
"{}/pictrs/image/{}",
context.settings().get_protocol_and_hostname(),
response.files.first().expect("missing pictrs file").file
))?;
Ok(thumbnail_url)
} else {
Err(LemmyErrorType::PictrsResponseError(response.msg))?
}
} else {
Err(LemmyErrorType::PictrsCachingDisabled)?
// return the original image as "thumbnail"
Ok(image_url.clone())
}
}
// TODO: get rid of this
#[tracing::instrument(skip_all)]
async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
let response = client.get(url.as_str()).send().await?;
@ -296,32 +275,39 @@ mod tests {
#![allow(clippy::unwrap_used)]
#![allow(clippy::indexing_slicing)]
use crate::request::{client_builder, fetch_site_metadata, html_to_site_metadata, SiteMetadata};
use lemmy_utils::settings::SETTINGS;
use crate::{
context::LemmyContext,
request::{extract_opengraph_data, fetch_link_metadata},
};
use url::Url;
// These helped with testing
#[tokio::test]
async fn test_site_metadata() {
let settings = &SETTINGS.clone();
let client = client_builder(settings).build().unwrap().into();
async fn test_link_metadata() {
let context = LemmyContext::init_test_context().await;
let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
let sample_res = fetch_link_metadata(&sample_url, false, &context)
.await
.unwrap();
assert_eq!(
SiteMetadata {
title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
description: Some(
"The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
),
image: Some(
Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
.unwrap()
.into()
),
embed_video_url: None,
},
sample_res
Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
sample_res.title
);
assert_eq!(
Some("The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()),
sample_res.description
);
assert_eq!(
Some(
Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
.unwrap()
.into()
),
sample_res.image
);
assert_eq!(None, sample_res.embed_video_url);
assert_eq!(None, sample_res.content_type);
assert_eq!(None, sample_res.thumbnail);
}
// #[test]
@ -339,7 +325,7 @@ mod tests {
// root relative url
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
assert_eq!(
metadata.image,
Some(Url::parse("https://example.com/image.jpg").unwrap().into())
@ -347,7 +333,7 @@ mod tests {
// base relative url
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
assert_eq!(
metadata.image,
Some(
@ -359,7 +345,7 @@ mod tests {
// absolute url
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
assert_eq!(
metadata.image,
Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
@ -367,7 +353,7 @@ mod tests {
// protocol relative url
let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata");
assert_eq!(
metadata.image,
Some(Url::parse("https://example.com/image.jpg").unwrap().into())

View File

@ -4,7 +4,7 @@ use lemmy_api_common::{
build_response::build_post_response,
context::LemmyContext,
post::{CreatePost, PostResponse},
request::fetch_site_data,
request::fetch_link_metadata,
send_activity::{ActivityChannel, SendActivityData},
utils::{
check_community_user_action,
@ -83,10 +83,12 @@ pub async fn create_post(
}
// Fetch post links and pictrs cached image
let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await;
let (embed_title, embed_description, embed_video_url) = metadata_res
.map(|u| (u.title, u.description, u.embed_video_url))
.unwrap_or_default();
let metadata = match data_url {
Some(url) => fetch_link_metadata(url, true, &context)
.await
.unwrap_or_default(),
_ => Default::default(),
};
// Only need to check if language is allowed in case user set it explicitly. When using default
// language, it already only returns allowed languages.
@ -117,11 +119,11 @@ pub async fn create_post(
.community_id(data.community_id)
.creator_id(local_user_view.person.id)
.nsfw(data.nsfw)
.embed_title(embed_title)
.embed_description(embed_description)
.embed_video_url(embed_video_url)
.embed_title(metadata.title)
.embed_description(metadata.description)
.embed_video_url(metadata.embed_video_url)
.language_id(language_id)
.thumbnail_url(thumbnail_url)
.thumbnail_url(metadata.thumbnail)
.build();
let inserted_post = Post::create(&mut context.pool(), &post_form)

View File

@ -4,7 +4,7 @@ use lemmy_api_common::{
build_response::build_post_response,
context::LemmyContext,
post::{EditPost, PostResponse},
request::fetch_site_data,
request::fetch_link_metadata,
send_activity::{ActivityChannel, SendActivityData},
utils::{check_community_user_action, local_site_to_slur_regex, process_markdown_opt},
};
@ -68,11 +68,12 @@ pub async fn update_post(
}
// Fetch post links and Pictrs cached image
let data_url = data.url.as_ref();
let (metadata_res, thumbnail_url) = fetch_site_data(data_url, true, &context).await;
let (embed_title, embed_description, embed_video_url) = metadata_res
.map(|u| (Some(u.title), Some(u.description), Some(u.embed_video_url)))
.unwrap_or_default();
let metadata = match data_url {
Some(url) => fetch_link_metadata(url, true, &context)
.await
.unwrap_or_default(),
_ => Default::default(),
};
let language_id = data.language_id;
CommunityLanguage::is_allowed_community_language(
@ -87,11 +88,11 @@ pub async fn update_post(
url,
body: diesel_option_overwrite(body),
nsfw: data.nsfw,
embed_title,
embed_description,
embed_video_url,
embed_title: Some(metadata.title),
embed_description: Some(metadata.description),
embed_video_url: Some(metadata.embed_video_url),
language_id: data.language_id,
thumbnail_url: Some(thumbnail_url),
thumbnail_url: Some(metadata.thumbnail),
updated: Some(Some(naive_now())),
..Default::default()
};

View File

@ -24,7 +24,7 @@ use chrono::{DateTime, Utc};
use html2text::{from_read_with_decorator, render::text_renderer::TrivialDecorator};
use lemmy_api_common::{
context::LemmyContext,
request::fetch_site_data,
request::fetch_link_metadata,
utils::{
is_mod_or_admin,
local_site_opt_to_sensitive,
@ -216,16 +216,12 @@ impl Object for ApubPost {
// Only fetch metadata if the post has a url and was not seen previously. We don't want to
// waste resources by fetching metadata for the same post multiple times.
// Additionally, only fetch image if content is not sensitive or is allowed on local site.
let (metadata_res, thumbnail) = match &url {
Some(url) if old_post.is_err() => fetch_site_data(Some(url), include_image, &context).await,
_ => (None, None),
// `old_post.is_err()` means the post is not stored locally yet. Posts which are already known
// skip the network request and use empty metadata, so that receiving the same post repeatedly
// does not refetch the link every time.
let metadata = match &url {
  Some(url) if old_post.is_err() => fetch_link_metadata(url, include_image, context)
    .await
    .unwrap_or_default(),
  _ => Default::default(),
};
// If no image was included with metadata, use post image instead when available.
let thumbnail_url = thumbnail.or_else(|| page.image.map(|i| i.url.into()));
let (embed_title, embed_description, embed_video_url) = metadata_res
.map(|u| (u.title, u.description, u.embed_video_url))
.unwrap_or_default();
let slur_regex = &local_site_opt_to_slur_regex(&local_site);
let body = read_from_string_or_source_opt(&page.content, &page.media_type, &page.source);
@ -245,10 +241,10 @@ impl Object for ApubPost {
updated: page.updated.map(Into::into),
deleted: Some(false),
nsfw: page.sensitive,
embed_title,
embed_description,
embed_video_url,
thumbnail_url,
embed_title: metadata.title,
embed_description: metadata.description,
embed_video_url: metadata.embed_video_url,
thumbnail_url: metadata.thumbnail,
ap_id: Some(page.id.clone().into()),
local: Some(false),
language_id,