Trying another tokenizer fix. #1964

another_tokenizer_fix
Dessalines 2021-12-20 16:57:36 -05:00
parent c883a49a40
commit ad4715c2a3
1 changed files with 24 additions and 4 deletions

View File

@ -5,7 +5,7 @@ use reqwest_middleware::ClientWithMiddleware;
use serde::{Deserialize, Serialize};
use std::future::Future;
use thiserror::Error;
use tracing::error;
use tracing::{error, info};
use url::Url;
use webpage::HTML;
@ -64,12 +64,18 @@ pub async fn fetch_site_metadata(
client: &ClientWithMiddleware,
url: &Url,
) -> Result<SiteMetadata, LemmyError> {
info!("Fetching site metadata for url: {}", url);
let response = client.get(url.as_str()).send().await?;
let html = response
.text()
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response
.bytes()
.await
.map_err(|e| RecvError(e.to_string()))?;
.map_err(|e| RecvError(e.to_string()))?
.to_vec();
let html = String::from_utf8_lossy(&html_bytes);
let tags = html_to_site_metadata(&html)?;
@ -77,6 +83,20 @@ pub async fn fetch_site_metadata(
}
fn html_to_site_metadata(html: &str) -> Result<SiteMetadata, LemmyError> {
// Make sure the first line is doctype html
let first_line = html
.lines()
.into_iter()
.next()
.ok_or_else(|| LemmyError::from_message("No lines in html"))?
.to_lowercase();
if !first_line.starts_with("<!doctype html>") {
return Err(LemmyError::from_message(
"Site metadata page fetch is not DOCTYPE html",
));
}
let page = HTML::from_string(html.to_string(), None)?;
let page_title = page.title;