Fix URL regex and load tokenizer locally.

master
float-trip 2023-07-09 23:28:15 +00:00
parent db9d8a706e
commit b235222ac2
1 changed files with 7 additions and 4 deletions

View File

@@ -8,9 +8,12 @@ from config import config
 from maxsubstring import longest_common_substring

 URL_REGEX = (
-    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
+    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
 )
-tokenizer = GPTNeoXTokenizerFast.from_pretrained("mosaicml/mpt-7b")
+tokenizer = GPTNeoXTokenizerFast.from_pretrained(
+    f"{config['data_dir']}/mpt-30b-drama-ba678"
+)
def remove_notifications(text): def remove_notifications(text):
@@ -165,8 +168,8 @@ def extract_reply(text):
     - End with the start of a new comment: `float-trip +10`
     For the latter case, drop the last line.
     """
-    pattern = r"^ *[\w-]* +\+.*$"
+    new_comment_pattern = r"^ *[\w-]* +\+.*$"
     lines = text.split("\n")
-    if re.match(pattern, lines[-1]):
+    if re.match(new_comment_pattern, lines[-1]):
         lines = lines[:-1]
     return "\n".join([line.strip() for line in lines]).strip()