# ml-finetuning/gpt-j/preprocess_comments.py

import numpy as np


class Comment:
    """One comment parsed from a tab-separated row of the dump.

    The row's columns are read positionally: comment id, author id,
    parent id and body. Every instance registers itself in the shared
    ``by_id`` mapping so that replies can later be attached to parents.
    """

    # Registry shared by all instances: comment_id -> Comment.
    by_id = {}

    def __init__(self, line):
        """Parse *line* and register the resulting comment in ``by_id``."""
        fields = line.split("\t")

        self.comment_id = int(fields[0])

        # A literal "\N" in the parent column means "no parent".
        raw_parent = fields[2]
        if raw_parent == "\\N":
            self.parent_id = None
        else:
            self.parent_id = int(raw_parent)

        self.author_id = int(fields[1])

        # The body stores line breaks as escaped text; turn them into real
        # newlines (CRLF pairs first, so they collapse to a single newline).
        unescaped = fields[3].replace("\\r\\n", "\n")
        unescaped = unescaped.replace("\\n", "\n")
        self.body = unescaped.strip()

        # Tree links start empty; they are filled in once all rows are loaded.
        self.parent = None
        self.children = []

        self.by_id[self.comment_id] = self


# Load every row first: constructing a Comment registers it in
# Comment.by_id, and parents can only be resolved once all rows are known.
# The explicit encoding keeps parsing independent of the platform locale.
with open("drama.csv", encoding="utf-8") as f:
    # Whitespace-only lines (e.g. a trailing newline at EOF) carry no row.
    comments = [Comment(line) for line in f if line.strip()]

# Attach each reply to its parent comment.
for c in comments:
    # Compare against None rather than relying on truthiness, so that a
    # parent id of 0 is still treated as a real reference.
    if c.parent_id is None:
        continue

    parent = Comment.by_id.get(c.parent_id)
    if parent is None:
        # The parent row is absent from the dump. Flag the comment with -1
        # so it is neither linked nor has a parent_id of None.
        c.parent_id = -1
    else:
        c.parent = parent
        parent.children.append(c)


def render_threads(c):
    """Return one rendered string per path from *c* down to a leaf.

    Each comment on a path is rendered as an author header line followed
    by its body and a blank line. A comment without children yields a
    single-element list; otherwise the comment's own text is prefixed to
    every thread produced by each child, in child order.
    """
    # TODO: switch to a more unique separator that doesn't conflict with
    # Markdown.
    header = f"--- {c.author_id} ---\n{c.body}\n\n"

    # Leaf: the path ends here.
    if not c.children:
        return [header]

    return [
        header + tail
        for reply in c.children
        for tail in render_threads(reply)
    ]


# Build the training samples: one string per root-to-leaf path of every
# qualifying top-level comment.
threads = []
for c in comments:
    # Only true roots (parent_id is None) that have at least one reply.
    # NOTE(review): author 261 is excluded; the reason is not visible in
    # this file (presumably a bot/system account) -- confirm.
    if c.parent_id is None and c.author_id != 261 and c.children:
        threads.extend(t.strip() for t in render_threads(c))

# Shuffle in place so samples from the same root are not adjacent.
np.random.shuffle(threads)

# Write with an explicit encoding so non-ASCII comment text does not depend
# on (or fail under) the platform's default codec.
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("<|endoftext|>".join(threads))