Fix URL regex and load tokenizer locally.
parent
db9d8a706e
commit
b235222ac2
11
utils.py
11
utils.py
|
@@ -8,9 +8,12 @@ from config import config
|
||||||
from maxsubstring import longest_common_substring
|
from maxsubstring import longest_common_substring
|
||||||
|
|
||||||
URL_REGEX = (
|
URL_REGEX = (
|
||||||
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
|
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer = GPTNeoXTokenizerFast.from_pretrained(
|
||||||
|
f"{config['data_dir']}/mpt-30b-drama-ba678"
|
||||||
)
|
)
|
||||||
tokenizer = GPTNeoXTokenizerFast.from_pretrained("mosaicml/mpt-7b")
|
|
||||||
|
|
||||||
|
|
||||||
def remove_notifications(text):
|
def remove_notifications(text):
|
||||||
|
@@ -165,8 +168,8 @@ def extract_reply(text):
|
||||||
- End with the start of a new comment: `float-trip +10`
|
- End with the start of a new comment: `float-trip +10`
|
||||||
For the latter case, drop the last line.
|
For the latter case, drop the last line.
|
||||||
"""
|
"""
|
||||||
pattern = r"^ *[\w-]* +\+.*$"
|
new_comment_pattern = r"^ *[\w-]* +\+.*$"
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
if re.match(pattern, lines[-1]):
|
if re.match(new_comment_pattern, lines[-1]):
|
||||||
lines = lines[:-1]
|
lines = lines[:-1]
|
||||||
return "\n".join([line.strip() for line in lines]).strip()
|
return "\n".join([line.strip() for line in lines]).strip()
|
||||||
|
|
Loading…
Reference in New Issue