mirror of https://github.com/LemmyNet/lemmy.git
Sitemap (#3808)
* generate sitemap.xml file
* set up endpoint for sitemap
* Update sitemap generation
  - remove sitemap generation from scheduled tasks
  - add posts query for sitemap
  - create sitemap module in API crate
* remove priority and change freq from sitemap
* add configuration option for number of posts for sitemap
* fix default config
* rate limit sitemap endpoint
* update sitemap query
* update sitemap generation
  - remove config value for query limit
  - adjust sitemap generation to query changes
  - tidy up error handling
* refactor sitemap generation loop
* remove `limit` argument
* refactor `generate_urlset` and add unit test
* change query to only fetch local posts of past 24h
* fix outdated comment and log
* cargo fmt
pull/3914/head
parent
ab828b81e4
commit
28324ad2c8
|
@ -1603,6 +1603,15 @@ version = "1.8.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
|
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "elementtree"
|
||||||
|
version = "1.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3efd4742acf458718a6456e0adf0b4d734d6b783e452bbf1ac36bf31f4085cb3"
|
||||||
|
dependencies = [
|
||||||
|
"string_cache",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "email-encoding"
|
name = "email-encoding"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
|
@ -2581,6 +2590,7 @@ dependencies = [
|
||||||
"bcrypt",
|
"bcrypt",
|
||||||
"captcha",
|
"captcha",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
"elementtree",
|
||||||
"lemmy_api_common",
|
"lemmy_api_common",
|
||||||
"lemmy_db_schema",
|
"lemmy_db_schema",
|
||||||
"lemmy_db_views",
|
"lemmy_db_views",
|
||||||
|
@ -2589,8 +2599,10 @@ dependencies = [
|
||||||
"lemmy_utils",
|
"lemmy_utils",
|
||||||
"serde",
|
"serde",
|
||||||
"serial_test",
|
"serial_test",
|
||||||
|
"sitemap-rs",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"url",
|
||||||
"uuid",
|
"uuid",
|
||||||
"wav",
|
"wav",
|
||||||
]
|
]
|
||||||
|
@ -4745,6 +4757,16 @@ version = "0.3.10"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sitemap-rs"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "95b58125f0ab4317b5ba3cdc1f60696e47958760e356874c759334fa56ae1596"
|
||||||
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
|
"xml-builder",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "skeptic"
|
name = "skeptic"
|
||||||
version = "0.13.7"
|
version = "0.13.7"
|
||||||
|
@ -6132,6 +6154,12 @@ dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xml-builder"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "efc4f1a86af7800dfc4056c7833648ea4515ae21502060b5c98114d828f5333b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xml5ever"
|
name = "xml5ever"
|
||||||
version = "0.17.0"
|
version = "0.17.0"
|
||||||
|
|
|
@ -31,8 +31,11 @@ captcha = { workspace = true }
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
|
url = { workspace = true }
|
||||||
wav = "1.0.0"
|
wav = "1.0.0"
|
||||||
|
sitemap-rs = "0.2.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
serial_test = { workspace = true }
|
serial_test = { workspace = true }
|
||||||
tokio = { workspace = true }
|
tokio = { workspace = true }
|
||||||
|
elementtree = "1.2.3"
|
||||||
|
|
|
@ -18,6 +18,7 @@ pub mod post_report;
|
||||||
pub mod private_message;
|
pub mod private_message;
|
||||||
pub mod private_message_report;
|
pub mod private_message_report;
|
||||||
pub mod site;
|
pub mod site;
|
||||||
|
pub mod sitemap;
|
||||||
|
|
||||||
#[async_trait::async_trait(?Send)]
|
#[async_trait::async_trait(?Send)]
|
||||||
pub trait Perform {
|
pub trait Perform {
|
||||||
|
|
|
@ -0,0 +1,142 @@
|
||||||
|
use actix_web::{
|
||||||
|
http::header::{self, CacheDirective},
|
||||||
|
web::Data,
|
||||||
|
HttpResponse,
|
||||||
|
};
|
||||||
|
use chrono::{DateTime, FixedOffset};
|
||||||
|
use lemmy_api_common::context::LemmyContext;
|
||||||
|
use lemmy_db_schema::{newtypes::DbUrl, source::post::Post};
|
||||||
|
use lemmy_utils::error::LemmyResult;
|
||||||
|
use sitemap_rs::{url::Url, url_set::UrlSet};
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
async fn generate_urlset(posts: Vec<(DbUrl, chrono::NaiveDateTime)>) -> LemmyResult<UrlSet> {
|
||||||
|
let urls = posts
|
||||||
|
.into_iter()
|
||||||
|
.map_while(|post| {
|
||||||
|
Url::builder(post.0.to_string())
|
||||||
|
.last_modified(DateTime::from_utc(
|
||||||
|
post.1,
|
||||||
|
FixedOffset::east_opt(0).expect("Error setting timezone offset"), // TODO what is the proper timezone offset here?
|
||||||
|
))
|
||||||
|
.build()
|
||||||
|
.ok()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(UrlSet::new(urls)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_sitemap(context: Data<LemmyContext>) -> LemmyResult<HttpResponse> {
|
||||||
|
info!("Generating sitemap with posts from last {} hours...", 24);
|
||||||
|
let posts = Post::list_for_sitemap(&mut context.pool()).await?;
|
||||||
|
info!("Loaded latest {} posts", posts.len());
|
||||||
|
|
||||||
|
let mut buf = Vec::<u8>::new();
|
||||||
|
generate_urlset(posts).await?.write(&mut buf)?;
|
||||||
|
|
||||||
|
Ok(
|
||||||
|
HttpResponse::Ok()
|
||||||
|
.content_type("application/xml")
|
||||||
|
.insert_header(header::CacheControl(vec![CacheDirective::MaxAge(86_400)])) // 24 h
|
||||||
|
.body(buf),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
pub(crate) mod tests {
  #![allow(clippy::unwrap_used)]

  use crate::sitemap::generate_urlset;
  use chrono::{NaiveDate, NaiveDateTime};
  use elementtree::Element;
  use lemmy_db_schema::newtypes::DbUrl;
  use url::Url;

  /// Builds a `(url, timestamp)` fixture in the shape `generate_urlset` consumes.
  fn post_fixture(address: &str, date: (i32, u32, u32), time: (u32, u32, u32)) -> (DbUrl, NaiveDateTime) {
    let timestamp = NaiveDate::from_ymd_opt(date.0, date.1, date.2)
      .unwrap()
      .and_hms_opt(time.0, time.1, time.2)
      .unwrap();
    (Url::parse(address).unwrap().into(), timestamp)
  }

  /// Returns the text of the first child of `parent` whose local tag name is `tag_name`.
  fn child_text<'a>(parent: &'a Element, tag_name: &str) -> &'a str {
    parent
      .children()
      .find(|child| child.tag().name() == tag_name)
      .unwrap()
      .text()
  }

  /// Checks that two posts are rendered as two `<url>` entries, each holding a
  /// `<loc>` followed by a `<lastmod>`, with the expected text in input order.
  #[tokio::test]
  async fn test_generate_urlset() {
    let posts: Vec<(DbUrl, NaiveDateTime)> = vec![
      post_fixture("https://example.com", (2022, 12, 1), (9, 10, 11)),
      post_fixture("https://lemmy.ml", (2023, 1, 1), (1, 2, 3)),
    ];

    let url_set = generate_urlset(posts).await.unwrap();
    let mut buf = Vec::<u8>::new();
    url_set.write(&mut buf).unwrap();
    let root = Element::from_reader(buf.as_slice()).unwrap();

    assert_eq!(root.tag().name(), "urlset");
    assert_eq!(root.child_count(), 2);

    // Every entry is a `<url>` with exactly `<loc>` then `<lastmod>`.
    for entry in root.children() {
      assert_eq!(entry.tag().name(), "url");
      assert_eq!(entry.child_count(), 2);

      let child_tags: Vec<&str> = entry.children().map(|child| child.tag().name()).collect();
      assert_eq!(child_tags, ["loc", "lastmod"]);
    }

    // Entry contents, in the same order as the input posts.
    let expected = [
      ("https://example.com/", "2022-12-01T09:10:11+00:00"),
      ("https://lemmy.ml/", "2023-01-01T01:02:03+00:00"),
    ];
    for (entry, (loc, lastmod)) in root.children().zip(expected) {
      assert_eq!(child_text(entry, "loc"), loc);
      assert_eq!(child_text(entry, "lastmod"), lastmod);
    }
  }
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
use super::instance::coalesce;
|
||||||
use crate::{
|
use crate::{
|
||||||
newtypes::{CommunityId, DbUrl, PersonId, PostId},
|
newtypes::{CommunityId, DbUrl, PersonId, PostId},
|
||||||
schema::post::dsl::{
|
schema::post::dsl::{
|
||||||
|
@ -7,6 +8,7 @@ use crate::{
|
||||||
creator_id,
|
creator_id,
|
||||||
deleted,
|
deleted,
|
||||||
featured_community,
|
featured_community,
|
||||||
|
local,
|
||||||
name,
|
name,
|
||||||
post,
|
post,
|
||||||
published,
|
published,
|
||||||
|
@ -30,6 +32,7 @@ use crate::{
|
||||||
utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
|
utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
|
||||||
};
|
};
|
||||||
use ::url::Url;
|
use ::url::Url;
|
||||||
|
use chrono::{Duration, Utc};
|
||||||
use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
|
use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
|
||||||
use diesel_async::RunQueryDsl;
|
use diesel_async::RunQueryDsl;
|
||||||
|
|
||||||
|
@ -96,6 +99,21 @@ impl Post {
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn list_for_sitemap(
|
||||||
|
pool: &mut DbPool<'_>,
|
||||||
|
) -> Result<Vec<(DbUrl, chrono::NaiveDateTime)>, Error> {
|
||||||
|
let conn = &mut get_conn(pool).await?;
|
||||||
|
post
|
||||||
|
.select((ap_id, coalesce(updated, published)))
|
||||||
|
.filter(local)
|
||||||
|
.filter(deleted.eq(false))
|
||||||
|
.filter(removed.eq(false))
|
||||||
|
.filter(published.ge(Utc::now().naive_utc() - Duration::days(1)))
|
||||||
|
.order(published.desc())
|
||||||
|
.load::<(DbUrl, chrono::NaiveDateTime)>(conn)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn permadelete_for_creator(
|
pub async fn permadelete_for_creator(
|
||||||
pool: &mut DbPool<'_>,
|
pool: &mut DbPool<'_>,
|
||||||
for_creator_id: PersonId,
|
for_creator_id: PersonId,
|
||||||
|
|
|
@ -16,6 +16,7 @@ use lemmy_api::{
|
||||||
local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
|
local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
|
||||||
post::{feature::feature_post, like::like_post, lock::lock_post},
|
post::{feature::feature_post, like::like_post, lock::lock_post},
|
||||||
post_report::create::create_post_report,
|
post_report::create::create_post_report,
|
||||||
|
sitemap::get_sitemap,
|
||||||
Perform,
|
Perform,
|
||||||
};
|
};
|
||||||
use lemmy_api_common::{
|
use lemmy_api_common::{
|
||||||
|
@ -340,6 +341,11 @@ pub fn config(cfg: &mut web::ServiceConfig, rate_limit: &RateLimitCell) {
|
||||||
.route("/delete", web::post().to(delete_custom_emoji)),
|
.route("/delete", web::post().to(delete_custom_emoji)),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
cfg.service(
|
||||||
|
web::scope("/sitemap.xml")
|
||||||
|
.wrap(rate_limit.message())
|
||||||
|
.route("", web::get().to(get_sitemap)),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn perform<'a, Data>(
|
async fn perform<'a, Data>(
|
||||||
|
|
Loading…
Reference in New Issue