# lmops
# Dump the 'train' split of the 'openwebtext' dataset into a single text file,
# one document per line.
import os
import re

import datasets

# Every run of one or more newlines inside a document is replaced by this
# marker so that a document never spans more than one line of the output file.
_NEWLINE_MARKER = "<@x(x!>"
# Compiled once here instead of being looked up on every loop iteration.
_NEWLINE_RUN = re.compile(r"\n+")

dataset = datasets.load_dataset('openwebtext', split='train')

os.makedirs("data/openwebtext", exist_ok=True)

num = 0
# The encoding is given explicitly: without it Python falls back to the locale
# encoding, which on non-UTF-8 platforms can fail with UnicodeEncodeError on
# characters it cannot represent.
with open("data/openwebtext/data.txt", "w", encoding="utf-8") as f:
    # `num` ends up as the count of records written; it stays 0 when the
    # dataset yields nothing.
    for num, data in enumerate(dataset, start=1):
        f.write(_NEWLINE_RUN.sub(_NEWLINE_MARKER, data['text']) + "\n")

print("Number of lines:", num)