ml-finetuning/gpt-j/preprocess_comments.py

57 lines
1.4 KiB
Python

import numpy as np
class Comment:
    """One comment parsed from a tab-separated dump line.

    The line is split on tabs and the first four fields are read as
    comment id, author id, parent id (the literal ``\\N`` means "no parent")
    and body. Every constructed instance registers itself in ``by_id``.
    """

    # Registry of all comments constructed so far, keyed by comment id.
    by_id = {}

    def __init__(self, line):
        """Parse ``line`` and register the new comment in ``by_id``."""
        fields = line.split("\t")

        self.comment_id = int(fields[0])

        raw_parent = fields[2]
        if raw_parent == "\\N":
            self.parent_id = None
        else:
            self.parent_id = int(raw_parent)

        self.author_id = int(fields[1])

        # Turn the escaped line breaks of the dump into real newlines;
        # the CRLF form must be handled before the bare LF form.
        text = fields[3]
        for escaped in ("\\r\\n", "\\n"):
            text = text.replace(escaped, "\n")
        self.body = text.strip()

        # Tree links start empty; they are filled in by code outside this class.
        self.parent = None
        self.children = []

        self.by_id[self.comment_id] = self
# Load every comment from the tab-separated dump, one Comment per line.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform
# default, and blank lines are skipped because Comment() would fail on
# int("") for them.
with open("drama.csv", encoding="utf-8") as f:
    comments = [Comment(line) for line in f if line.strip()]

# Link each reply to its parent so the comments form trees.
for c in comments:
    # Compare against None explicitly: a parent id of 0 is still a parent id.
    if c.parent_id is None:
        continue
    parent = Comment.by_id.get(c.parent_id)
    if parent is None:
        # The parent is not in the dump. Mark the comment with a sentinel so
        # it is not mistaken for a top-level comment (parent_id is None).
        c.parent_id = -1
    else:
        c.parent = parent
        parent.children.append(c)
def render_threads(c):
    """Render every root-to-leaf reply chain below ``c`` as one text thread.

    Each comment on a chain is rendered as ``--- <author_id> ---``, a newline,
    the comment body and a blank line; the rendered comments of a chain are
    concatenated from ``c`` down to the leaf.

    The tree is walked with an explicit stack instead of recursion so that
    very deep reply chains cannot raise ``RecursionError``.

    Args:
        c: The comment to start from; it and its descendants must expose
            ``author_id``, ``body`` and ``children``.

    Returns:
        A list with one string per leaf below ``c`` (or a single string for
        ``c`` itself when it has no children), in depth-first order following
        the order of each ``children`` list.
    """
    threads = []
    # Rendered comments on the path from ``c`` to the node being visited.
    path = []
    # Pending nodes paired with their depth below ``c``.
    stack = [(c, 0)]
    while stack:
        node, depth = stack.pop()
        # Drop the entries left over from a previously visited deeper branch.
        del path[depth:]
        # Use a more unique separator in the future that doesn't conflict with Markdown.
        path.append(f"--- {node.author_id} ---\n{node.body}\n\n")
        if not node.children:
            threads.append("".join(path))
            continue
        # Push in reverse so the first child is popped (and rendered) first.
        stack.extend((child, depth + 1) for child in reversed(node.children))
    return threads
# Collect the rendered threads of every top-level comment that has replies.
threads = []
for c in comments:
    # parent_id is None only for top-level comments; comments without
    # children are skipped, so single-comment "threads" are never emitted.
    # NOTE(review): author 261 is excluded here; presumably an account whose
    # threads are unwanted in the training data. Confirm and name the constant.
    if c.parent_id is None and c.author_id != 261 and c.children:
        threads.extend(t.strip() for t in render_threads(c))

# Randomise thread order in place before writing.
np.random.shuffle(threads)

# Write all threads into one file, separated by the "<|endoftext|>" marker.
# The encoding is pinned to UTF-8 so non-ASCII comment text is written the
# same way on every platform instead of depending on the locale default.
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("<|endoftext|>".join(threads))