
@ 0,0 +1,56 @@


import numpy as np






class Comment:
    """A single comment parsed from one tab-separated line of the dump.

    Every instance registers itself in the class-level ``by_id`` mapping so
    that other comments can later be looked up by their id.
    """

    # Registry of all comments constructed so far, keyed by comment id.
    # It lives on the class, so it is shared by every instance.
    by_id = {}

    def __init__(self, line):
        """Parse ``line`` and add the new comment to ``by_id``.

        ``line`` is split on tab characters. Field 0 is the comment id,
        field 1 the author id, field 2 the parent id and field 3 the body.
        """
        fields = line.split("\t")

        self.comment_id = int(fields[0])

        # The two-character text backslash-N in the parent column marks a
        # comment without a parent.
        raw_parent = fields[2]
        if raw_parent == "\\N":
            self.parent_id = None
        else:
            self.parent_id = int(raw_parent)

        self.author_id = int(fields[1])

        # Escaped line breaks (literal backslash sequences) become real
        # newlines; the CRLF form is handled first so it collapses to one.
        body = fields[3]
        for escaped in ("\\r\\n", "\\n"):
            body = body.replace(escaped, "\n")
        self.body = body.strip()

        # Tree links start out empty; nothing is connected at construction.
        self.parent = None
        self.children = []

        self.by_id[self.comment_id] = self






# Load every row of the dump. Constructing a Comment also registers it in
# Comment.by_id, which the linking pass below relies on.
# NOTE(review): assumes the dump is UTF-8 encoded — confirm against the export.
with open("drama.csv", encoding="utf-8") as f:
    comments = [Comment(line) for line in f]

# Second pass: connect each reply to its parent now that every id is known.
for c in comments:
    # Compare against None explicitly: None is the "no parent" marker set by
    # Comment.__init__, and a truthiness test would also skip a parent id of 0.
    if c.parent_id is None:
        continue

    parent = Comment.by_id.get(c.parent_id)
    if parent is not None:
        c.parent = parent
        parent.children.append(c)
    else:
        # The parent is not present in the dump. The comment is not attached
        # to anything; only its parent_id is overwritten, which keeps it
        # non-None so it is not mistaken for a top-level comment later.
        # NOTE(review): the value 1 is hard-coded — confirm the intent.
        c.parent_id = 1






def render_threads(c):
    """Flatten the reply tree rooted at ``c`` into one string per leaf.

    Each returned string is the concatenation of the rendered comments on a
    path from ``c`` down to a comment that has no children. A rendered
    comment is the author id padded with one space on each side on its own
    line, then the body, then a blank line.

    Returns a list of strings; a childless ``c`` yields a one-element list.
    """
    # Use a more unique separator in the future that doesn't conflict with Markdown.
    header = f" {c.author_id} \n{c.body}\n\n"

    replies = c.children
    if replies:
        # Prefix this comment onto every path produced by every reply,
        # keeping replies and their paths in their existing order.
        return [
            header + tail
            for reply in replies
            for tail in render_threads(reply)
        ]

    # Leaf comment: the path ends here.
    return [header]






# Collect one training sample per root-to-leaf path of every eligible thread.
threads = []
for c in comments:
    # A thread starts at a comment with no parent that has at least one reply.
    # NOTE(review): author id 261 is excluded by a hard-coded check — the
    # reason is not visible here; confirm and consider naming the constant.
    if c.parent_id is None and c.author_id != 261 and c.children:
        threads.extend(t.strip() for t in render_threads(c))

# Randomise the sample order in place.
np.random.shuffle(threads)

# Write with an explicit encoding so the output does not depend on the
# platform's default locale (which can fail on non-ASCII comment text).
# NOTE(review): GPT-2-family tokenizers spell their end-of-text token
# "<|endoftext|>"; confirm this delimiter matches what the downstream
# tooling expects before changing it.
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("<endoftext>".join(threads))


@ 0,0 +1,3 @@


* [Finetuning guide](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md)


* [Updated config instructions for the TPU machine](https://github.com/kingoflolz/mesh-transformer-jax/issues/202#issuecomment-1050887576)


* Fits on a free tier Oracle Cloud machine with 24GB RAM and 16GB of swap.


@ 0,0 +1,3 @@


