diff --git a/utils.py b/utils.py index 5a3341f..685185f 100644 --- a/utils.py +++ b/utils.py @@ -8,9 +8,12 @@ from config import config from maxsubstring import longest_common_substring URL_REGEX = ( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", + r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" +) + +tokenizer = GPTNeoXTokenizerFast.from_pretrained( + f"{config['data_dir']}/mpt-30b-drama-ba678" ) -tokenizer = GPTNeoXTokenizerFast.from_pretrained("mosaicml/mpt-7b") def remove_notifications(text): @@ -165,8 +168,8 @@ def extract_reply(text): - End with the start of a new comment: `float-trip +10` For the latter case, drop the last line. """ - pattern = r"^ *[\w-]* +\+.*$" + new_comment_pattern = r"^ *[\w-]* +\+.*$" lines = text.split("\n") - if re.match(pattern, lines[-1]): + if re.match(new_comment_pattern, lines[-1]): lines = lines[:-1] return "\n".join([line.strip() for line in lines]).strip()