57 lines
1.4 KiB
Python
57 lines
1.4 KiB
Python
import numpy as np
|
|
|
|
|
|
class Comment:
    r"""One comment parsed from a tab-separated line.

    The line is split on tab characters and read positionally:
    field 0 is the comment id, field 1 the author id, field 2 the parent
    comment id (or the literal two-character marker ``\N`` when there is
    no parent), and field 3 the comment body.

    Every instance registers itself in the class-level ``by_id`` mapping
    so other comments can be looked up by id after parsing.
    """

    # Registry shared by all instances: comment_id -> Comment.
    by_id = {}

    def __init__(self, line):
        r"""Parse *line* and register the new comment in ``by_id``.

        Raises ``IndexError`` if the line has fewer than four tab-separated
        fields and ``ValueError`` if an id field is not an integer.
        """
        fields = line.split("\t")

        self.comment_id = int(fields[0])

        # "\N" in the parent column means "no parent".
        raw_parent = fields[2]
        if raw_parent == "\\N":
            self.parent_id = None
        else:
            self.parent_id = int(raw_parent)

        self.author_id = int(fields[1])

        # Turn escaped line breaks (literal backslash sequences) into real
        # newlines; the CRLF form must be handled before the bare LF form.
        text = fields[3]
        for escaped in ("\\r\\n", "\\n"):
            text = text.replace(escaped, "\n")
        self.body = text.strip()

        # Tree links start empty; they are filled in by code outside this class.
        self.parent = None
        self.children = []

        self.by_id[self.comment_id] = self
|
|
|
|
|
|
# Parse every non-blank line of the dump into a Comment. Blank lines (for
# example a trailing empty line) carry no fields and would otherwise make
# the integer parsing in Comment.__init__ fail.
with open("drama.csv") as f:
    comments = [Comment(line) for line in f if line.strip()]

# Link each comment to its parent now that every comment is registered in
# Comment.by_id.
for c in comments:
    # Compare against None explicitly: a parent id of 0 is falsy but is
    # still a real id that must be linked.
    if c.parent_id is not None:
        if c.parent_id in Comment.by_id:
            c.parent = Comment.by_id[c.parent_id]
            c.parent.children.append(c)
        else:
            # The parent is missing from the dump. Mark the comment with -1
            # so its parent_id is no longer a valid id but is also not None
            # (None is what identifies a top-level comment).
            c.parent_id = -1
|
|
|
|
|
|
def render_threads(c):
    """Return one rendered string per root-to-leaf path starting at *c*.

    Each comment on a path is rendered as an author header line followed by
    its body and a blank line. A comment without children yields a single
    one-element list; otherwise this comment's text is prefixed to every
    thread produced by each of its children, in child order.
    """
    # TODO: use a more unique separator that doesn't conflict with Markdown.
    own_text = f"--- {c.author_id} ---\n{c.body}\n\n"

    # Leaf: the path ends here.
    if not c.children:
        return [own_text]

    return [
        own_text + continuation
        for reply in c.children
        for continuation in render_threads(reply)
    ]
|
|
|
|
|
|
# Gather the rendered threads of every top-level comment that has at least
# one reply. Comments by author id 261 are excluded as thread starters
# (NOTE(review): the reason for excluding 261 is not visible here).
threads = []
for c in comments:
    if c.parent_id is not None or c.author_id == 261 or not c.children:
        continue
    threads.extend(t.strip() for t in render_threads(c))

# Randomise thread order in place.
np.random.shuffle(threads)

# Write all threads to a single file, separated by the end-of-text marker.
with open("train.txt", "w") as f:
    f.write("<|endoftext|>".join(threads))
|