lemmy/lemmy_apub/src/fetcher.rs

476 lines
15 KiB
Rust
Raw Normal View History

2020-05-16 14:04:08 +00:00
use crate::{
check_is_apub_id_valid,
objects::FromApub,
ActorType,
GroupExt,
NoteExt,
PageExt,
PersonExt,
APUB_JSON_CONTENT_TYPE,
2020-05-16 14:04:08 +00:00
};
use activitystreams::{base::BaseExt, collection::OrderedCollection, prelude::*};
use anyhow::{anyhow, Context};
use chrono::NaiveDateTime;
use diesel::result::Error::NotFound;
use lemmy_db_queries::{source::user::User, ApubObject, Crud, Joinable, SearchType};
use lemmy_db_schema::{
naive_now,
2020-12-21 12:28:12 +00:00
source::{
comment::Comment,
community::{Community, CommunityModerator, CommunityModeratorForm},
post::Post,
user::User_,
},
};
use lemmy_db_views::{comment_view::CommentView, post_view::PostView};
use lemmy_db_views_actor::{community_view::CommunityView, user_view::UserViewSafe};
2020-09-16 13:31:30 +00:00
use lemmy_structs::{blocking, site::SearchResponse};
use lemmy_utils::{
location_info,
request::{retry, RecvError},
settings::Settings,
LemmyError,
};
use lemmy_websocket::LemmyContext;
use log::debug;
use reqwest::Client;
use serde::Deserialize;
use std::{fmt::Debug, time::Duration};
use url::Url;
static ACTOR_REFETCH_INTERVAL_SECONDS: i64 = 24 * 60 * 60;
2020-07-24 15:07:33 +00:00
static ACTOR_REFETCH_INTERVAL_SECONDS_DEBUG: i64 = 10;
/// Maximum number of HTTP requests allowed to handle a single incoming activity (or a single object
/// fetch through the search).
///
/// Tests are passing with a value of 5, so 10 should be safe for production.
static MAX_REQUEST_NUMBER: i32 = 10;
2020-04-17 15:33:55 +00:00
/// Fetch any type of ActivityPub object, handling things like HTTP headers, deserialisation,
/// timeouts etc.
async fn fetch_remote_object<Response>(
client: &Client,
url: &Url,
recursion_counter: &mut i32,
) -> Result<Response, LemmyError>
where
Response: for<'de> Deserialize<'de>,
{
*recursion_counter += 1;
if *recursion_counter > MAX_REQUEST_NUMBER {
return Err(anyhow!("Maximum recursion depth reached").into());
}
check_is_apub_id_valid(&url)?;
let timeout = Duration::from_secs(60);
let json = retry(|| {
client
.get(url.as_str())
.header("Accept", APUB_JSON_CONTENT_TYPE)
.timeout(timeout)
.send()
})
.await?
.json()
.await
.map_err(|e| {
debug!("Receive error, {}", e);
RecvError(e.to_string())
})?;
Ok(json)
}
2020-04-17 15:33:55 +00:00
/// The types of ActivityPub objects that can be fetched directly by searching for their ID.
2020-04-17 13:46:08 +00:00
#[serde(untagged)]
2020-04-17 17:34:18 +00:00
#[derive(serde::Deserialize, Debug)]
enum SearchAcceptedObjects {
2020-04-17 13:46:08 +00:00
Person(Box<PersonExt>),
Group(Box<GroupExt>),
2020-05-05 00:04:48 +00:00
Page(Box<PageExt>),
Comment(Box<NoteExt>),
2020-04-17 13:46:08 +00:00
}
2020-04-17 15:33:55 +00:00
/// Attempt to parse the query as URL, and fetch an ActivityPub object from it.
///
2020-10-19 14:29:35 +00:00
/// Some working examples for use with the `docker/federation/` setup:
/// http://lemmy_alpha:8541/c/main, or !main@lemmy_alpha:8541
/// http://lemmy_beta:8551/u/lemmy_alpha, or @lemmy_beta@lemmy_beta:8551
/// http://lemmy_gamma:8561/post/3
/// http://lemmy_delta:8571/comment/2
pub async fn search_by_apub_id(
query: &str,
context: &LemmyContext,
) -> Result<SearchResponse, LemmyError> {
// Parse the shorthand query url
let query_url = if query.contains('@') {
debug!("Search for {}", query);
let split = query.split('@').collect::<Vec<&str>>();
// User type will look like ['', username, instance]
// Community will look like [!community, instance]
let (name, instance) = if split.len() == 3 {
(format!("/u/{}", split[1]), split[2])
} else if split.len() == 2 {
if split[0].contains('!') {
let split2 = split[0].split('!').collect::<Vec<&str>>();
(format!("/c/{}", split2[1]), split[1])
} else {
return Err(anyhow!("Invalid search query: {}", query).into());
}
} else {
return Err(anyhow!("Invalid search query: {}", query).into());
};
let url = format!(
"{}://{}{}",
Settings::get().get_protocol_string(),
instance,
name
);
Url::parse(&url)?
} else {
Url::parse(&query)?
};
2020-04-17 13:46:08 +00:00
let mut response = SearchResponse {
type_: SearchType::All.to_string(),
comments: vec![],
posts: vec![],
communities: vec![],
users: vec![],
};
let domain = query_url.domain().context("url has no domain")?;
let recursion_counter = &mut 0;
let response = match fetch_remote_object::<SearchAcceptedObjects>(
context.client(),
&query_url,
recursion_counter,
)
.await?
{
SearchAcceptedObjects::Person(p) => {
let user_uri = p.inner.id(domain)?.context("person has no id")?;
let user = get_or_fetch_and_upsert_user(&user_uri, context, recursion_counter).await?;
response.users = vec![
blocking(context.pool(), move |conn| {
2020-12-04 00:47:58 +00:00
UserViewSafe::read(conn, user.id)
})
.await??,
];
response
}
SearchAcceptedObjects::Group(g) => {
let community_uri = g.inner.id(domain)?.context("group has no id")?;
let community =
get_or_fetch_and_upsert_community(community_uri, context, recursion_counter).await?;
response.communities = vec![
blocking(context.pool(), move |conn| {
CommunityView::read(conn, community.id, None)
})
.await??,
];
response
}
SearchAcceptedObjects::Page(p) => {
let p = Post::from_apub(&p, context, query_url, recursion_counter).await?;
response.posts =
vec![blocking(context.pool(), move |conn| PostView::read(conn, p.id, None)).await??];
response
}
SearchAcceptedObjects::Comment(c) => {
let c = Comment::from_apub(&c, context, query_url, recursion_counter).await?;
response.comments = vec![
blocking(context.pool(), move |conn| {
CommentView::read(conn, c.id, None)
})
.await??,
];
response
}
};
2020-04-17 13:46:08 +00:00
Ok(response)
}
2020-10-19 14:29:35 +00:00
/// Get a remote actor from its apub ID (either a user or a community). Thin wrapper around
/// `get_or_fetch_and_upsert_user()` and `get_or_fetch_and_upsert_community()`.
///
/// If it exists locally and `!should_refetch_actor()`, it is returned directly from the database.
/// Otherwise it is fetched from the remote instance, stored and returned.
pub(crate) async fn get_or_fetch_and_upsert_actor(
apub_id: &Url,
context: &LemmyContext,
recursion_counter: &mut i32,
) -> Result<Box<dyn ActorType>, LemmyError> {
let community = get_or_fetch_and_upsert_community(apub_id, context, recursion_counter).await;
2020-10-19 14:29:35 +00:00
let actor: Box<dyn ActorType> = match community {
Ok(c) => Box::new(c),
Err(_) => Box::new(get_or_fetch_and_upsert_user(apub_id, context, recursion_counter).await?),
};
Ok(actor)
}
2020-10-19 14:29:35 +00:00
/// Get a user from its apub ID.
///
/// If it exists locally and `!should_refetch_actor()`, it is returned directly from the database.
/// Otherwise it is fetched from the remote instance, stored and returned.
pub(crate) async fn get_or_fetch_and_upsert_user(
apub_id: &Url,
context: &LemmyContext,
recursion_counter: &mut i32,
) -> Result<User_, LemmyError> {
let apub_id_owned = apub_id.to_owned();
let user = blocking(context.pool(), move |conn| {
User_::read_from_apub_id(conn, apub_id_owned.as_ref())
})
.await?;
match user {
// If its older than a day, re-fetch it
Ok(u) if !u.local && should_refetch_actor(u.last_refreshed_at) => {
debug!("Fetching and updating from remote user: {}", apub_id);
let person =
fetch_remote_object::<PersonExt>(context.client(), apub_id, recursion_counter).await;
// If fetching failed, return the existing data.
if person.is_err() {
return Ok(u);
}
let user = User_::from_apub(&person?, context, apub_id.to_owned(), recursion_counter).await?;
let user_id = user.id;
blocking(context.pool(), move |conn| {
User_::mark_as_updated(conn, user_id)
})
.await??;
Ok(user)
2020-04-24 19:55:54 +00:00
}
Ok(u) => Ok(u),
2020-04-24 14:04:36 +00:00
Err(NotFound {}) => {
debug!("Fetching and creating remote user: {}", apub_id);
let person =
fetch_remote_object::<PersonExt>(context.client(), apub_id, recursion_counter).await?;
let user = User_::from_apub(&person, context, apub_id.to_owned(), recursion_counter).await?;
Ok(user)
2020-04-24 14:04:36 +00:00
}
Err(e) => Err(e.into()),
2020-04-24 14:04:36 +00:00
}
2020-04-07 21:02:32 +00:00
}
2020-04-17 13:46:08 +00:00
/// Determines when a remote actor should be refetched from its instance. In release builds, this is
2020-10-19 14:29:35 +00:00
/// `ACTOR_REFETCH_INTERVAL_SECONDS` after the last refetch, in debug builds
/// `ACTOR_REFETCH_INTERVAL_SECONDS_DEBUG`.
///
/// TODO it won't pick up new avatars, summaries etc until a day after.
/// Actors need an "update" activity pushed to other servers to fix this.
fn should_refetch_actor(last_refreshed: NaiveDateTime) -> bool {
2020-07-24 15:07:33 +00:00
let update_interval = if cfg!(debug_assertions) {
// avoid infinite loop when fetching community outbox
chrono::Duration::seconds(ACTOR_REFETCH_INTERVAL_SECONDS_DEBUG)
} else {
2020-07-24 15:07:33 +00:00
chrono::Duration::seconds(ACTOR_REFETCH_INTERVAL_SECONDS)
};
last_refreshed.lt(&(naive_now() - update_interval))
}
2020-10-19 14:29:35 +00:00
/// Get a community from its apub ID.
///
/// If it exists locally and `!should_refetch_actor()`, it is returned directly from the database.
/// Otherwise it is fetched from the remote instance, stored and returned.
pub(crate) async fn get_or_fetch_and_upsert_community(
apub_id: &Url,
context: &LemmyContext,
recursion_counter: &mut i32,
) -> Result<Community, LemmyError> {
let apub_id_owned = apub_id.to_owned();
let community = blocking(context.pool(), move |conn| {
Community::read_from_apub_id(conn, apub_id_owned.as_str())
})
.await?;
match community {
Ok(c) if !c.local && should_refetch_actor(c.last_refreshed_at) => {
debug!("Fetching and updating from remote community: {}", apub_id);
fetch_remote_community(apub_id, context, Some(c), recursion_counter).await
2020-04-24 19:55:54 +00:00
}
Ok(c) => Ok(c),
2020-04-24 14:04:36 +00:00
Err(NotFound {}) => {
debug!("Fetching and creating remote community: {}", apub_id);
fetch_remote_community(apub_id, context, None, recursion_counter).await
2020-07-24 15:07:33 +00:00
}
Err(e) => Err(e.into()),
}
}
/// Request a community by apub ID from a remote instance, including moderators. If `old_community`,
2020-10-19 14:29:35 +00:00
/// is set, this is an update for a community which is already known locally. If not, we don't know
/// the community yet and also pull the outbox, to get some initial posts.
2020-07-24 15:07:33 +00:00
async fn fetch_remote_community(
apub_id: &Url,
context: &LemmyContext,
old_community: Option<Community>,
recursion_counter: &mut i32,
2020-07-24 15:07:33 +00:00
) -> Result<Community, LemmyError> {
let group = fetch_remote_object::<GroupExt>(context.client(), apub_id, recursion_counter).await;
// If fetching failed, return the existing data.
if let Some(ref c) = old_community {
if group.is_err() {
return Ok(c.to_owned());
}
}
2020-07-24 15:07:33 +00:00
let group = group?;
let community =
Community::from_apub(&group, context, apub_id.to_owned(), recursion_counter).await?;
2020-07-24 15:07:33 +00:00
// Also add the community moderators too
let attributed_to = group.inner.attributed_to().context(location_info!())?;
2020-07-24 15:07:33 +00:00
let creator_and_moderator_uris: Vec<&Url> = attributed_to
.as_many()
.context(location_info!())?
2020-07-24 15:07:33 +00:00
.iter()
.map(|a| a.as_xsd_any_uri().context(""))
.collect::<Result<Vec<&Url>, anyhow::Error>>()?;
2020-05-03 14:00:59 +00:00
2020-07-24 15:07:33 +00:00
let mut creator_and_moderators = Vec::new();
2020-07-24 15:07:33 +00:00
for uri in creator_and_moderator_uris {
let c_or_m = get_or_fetch_and_upsert_user(uri, context, recursion_counter).await?;
2020-07-24 15:07:33 +00:00
creator_and_moderators.push(c_or_m);
}
2020-07-24 15:07:33 +00:00
// TODO: need to make this work to update mods of existing communities
if old_community.is_none() {
2020-07-24 15:07:33 +00:00
let community_id = community.id;
blocking(context.pool(), move |conn| {
2020-07-24 15:07:33 +00:00
for mod_ in creator_and_moderators {
let community_moderator_form = CommunityModeratorForm {
community_id,
user_id: mod_.id,
};
CommunityModerator::join(conn, &community_moderator_form)?;
2020-05-03 14:00:59 +00:00
}
2020-07-24 15:07:33 +00:00
Ok(()) as Result<(), LemmyError>
})
.await??;
}
2020-05-03 14:00:59 +00:00
2020-07-24 15:07:33 +00:00
// fetch outbox (maybe make this conditional)
let outbox = fetch_remote_object::<OrderedCollection>(
context.client(),
&community.get_outbox_url()?,
recursion_counter,
)
.await?;
let outbox_items = outbox.items().context(location_info!())?.clone();
2020-08-14 14:50:21 +00:00
let mut outbox_items = outbox_items.many().context(location_info!())?;
if outbox_items.len() > 20 {
outbox_items = outbox_items[0..20].to_vec();
}
for o in outbox_items {
let page = PageExt::from_any_base(o)?.context(location_info!())?;
let page_id = page.id_unchecked().context(location_info!())?;
// The post creator may be from a blocked instance, if it errors, then skip it
if check_is_apub_id_valid(page_id).is_err() {
continue;
}
Post::from_apub(&page, context, page_id.to_owned(), recursion_counter).await?;
2020-08-07 13:15:44 +00:00
// TODO: we need to send a websocket update here
2020-04-24 14:04:36 +00:00
}
2020-07-24 15:07:33 +00:00
Ok(community)
2020-04-09 19:04:31 +00:00
}
2020-04-24 19:55:54 +00:00
2020-10-19 14:29:35 +00:00
/// Gets a post by its apub ID. If it exists locally, it is returned directly. Otherwise it is
/// pulled from its apub ID, inserted and returned.
///
/// The parent community is also pulled if necessary. Comments are not pulled.
pub(crate) async fn get_or_fetch_and_insert_post(
post_ap_id: &Url,
context: &LemmyContext,
recursion_counter: &mut i32,
) -> Result<Post, LemmyError> {
let post_ap_id_owned = post_ap_id.to_owned();
let post = blocking(context.pool(), move |conn| {
Post::read_from_apub_id(conn, post_ap_id_owned.as_str())
})
.await?;
match post {
Ok(p) => Ok(p),
Err(NotFound {}) => {
debug!("Fetching and creating remote post: {}", post_ap_id);
let page =
fetch_remote_object::<PageExt>(context.client(), post_ap_id, recursion_counter).await?;
let post = Post::from_apub(&page, context, post_ap_id.to_owned(), recursion_counter).await?;
Ok(post)
}
Err(e) => Err(e.into()),
}
}
2020-10-19 14:29:35 +00:00
/// Gets a comment by its apub ID. If it exists locally, it is returned directly. Otherwise it is
/// pulled from its apub ID, inserted and returned.
///
/// The parent community, post and comment are also pulled if necessary.
pub(crate) async fn get_or_fetch_and_insert_comment(
comment_ap_id: &Url,
context: &LemmyContext,
recursion_counter: &mut i32,
) -> Result<Comment, LemmyError> {
let comment_ap_id_owned = comment_ap_id.to_owned();
let comment = blocking(context.pool(), move |conn| {
Comment::read_from_apub_id(conn, comment_ap_id_owned.as_str())
})
.await?;
match comment {
Ok(p) => Ok(p),
Err(NotFound {}) => {
debug!(
"Fetching and creating remote comment and its parents: {}",
comment_ap_id
);
let comment =
fetch_remote_object::<NoteExt>(context.client(), comment_ap_id, recursion_counter).await?;
let comment = Comment::from_apub(
&comment,
context,
comment_ap_id.to_owned(),
recursion_counter,
)
.await?;
let post_id = comment.post_id;
2020-11-09 16:06:54 +00:00
let post = blocking(context.pool(), move |conn| Post::read(conn, post_id)).await??;
if post.locked {
return Err(anyhow!("Post is locked").into());
}
Ok(comment)
}
Err(e) => Err(e.into()),
}
}