we don't need bloody bs4 to get a bloody title

2022-11-11 03:24:54 -06:00 · 2022-11-11 03:24:54 -06:00 · f5912bb4cc
parent d35dd7617c
commit f5912bb4cc
2 changed files with 7 additions and 4 deletions
--- a/files/helpers/regex.py
+++ b/files/helpers/regex.py
@ -116,6 +116,8 @@ pronouns_regex = re.compile("([a-z]{1,5})/[a-z]{1,5}(/[a-z]{1,5})?", flags=re.A|

 knowledgebase_page_regex = re.compile("[a-zA-Z0-9_\-]+", flags=re.A)

+html_title_regex = re.compile("<title>(.{1,200})</title>", flags=re.A|re.I)
+
 def sub_matcher(match:re.Match, upper=False, replace_with:Union[dict[str, str], dict[str, List[str]]]=SLURS):
 	group_num = 0
 	match_str = match.group(group_num)
--- a/files/routes/posts.py
+++ b/files/routes/posts.py
@ -1066,7 +1066,6 @@ extensions = IMAGE_FORMATS + VIDEO_FORMATS + AUDIO_FORMATS
@limiter.limit("3/minute", key_func=lambda:f'{SITE}-{session.get("lo_user")}')
@auth_required
 def get_post_title(v):
-
 	url = request.values.get("url")
 	if not url or '\\' in url: abort(400)

@ -1080,9 +1079,11 @@ def get_post_title(v):
 	content_type = x.headers.get("Content-Type")
 	if not content_type or "text/html" not in content_type: abort(400)

-	soup = BeautifulSoup(x.content, 'lxml')
-
-	title = soup.find('title')
+	# no you can't just parse html with reeeeeeeegex
+	match = html_title_regex.match(x.content)
+	if match and match.lastindex >= 1:
+		title = match.group(1)
+	
 	if not title: abort(400)

 	return {"url": url, "title": title.string}