math
"""
Clean the GPT-2 merges file: remove every merge rule whose tokens contain
digits, so that the only digit tokens left in the tokenizer are the single
characters "0" - "9".
"""
7
# Input: the original merges file, one space-separated merge rule per line.
merges_fname = "merges_gpt2.txt"
# Output: the same rules minus every rule that has a digit on either side.
new_merges_fname = "merges_gpt2_single_digit_numbers.txt"
def hasNumbers(inputString):
    """Return True if *inputString* contains at least one ASCII digit "0"-"9".

    The check is deliberately limited to the ten ASCII digits. ``str.isdigit()``
    also returns True for other Unicode characters (for example the
    superscripts "²", "³" and "¹"), which would classify entries as numeric
    even though they contain none of "0"-"9".
    NOTE(review): in a byte-level BPE merges file such characters presumably
    stand for raw bytes rather than numbers - confirm against the tokenizer.

    Args:
        inputString: The text to inspect; an empty string yields False.

    Returns:
        bool: True when any character is one of "0"-"9", otherwise False.
    """
    return any("0" <= char <= "9" for char in inputString)
# Read every merge rule up front. The input is read before the output file is
# opened so that a missing or unreadable input does not leave behind an empty,
# truncated output file. UTF-8 is requested explicitly because the platform
# default encoding is not guaranteed to be UTF-8.
with open(merges_fname, 'r', encoding="utf-8") as f:
    lines = f.read().split("\n")

with open(new_merges_fname, 'w', encoding="utf-8") as f_new:
    for line in lines:
        # The first empty line ends processing. Normally this is the empty
        # string that follows the file's final newline; any lines after an
        # earlier blank line are not copied.
        if len(line) < 1:
            break

        # Each rule must be exactly two space-separated fields; any other
        # shape raises ValueError here.
        # NOTE(review): every two-field line is treated as a rule, so a header
        # such as "#version: 0.2" would be reported and dropped because its
        # second field contains digits - confirm that consumers of the output
        # do not expect a header line.
        left, right = line.split(" ")
        if hasNumbers(left) or hasNumbers(right):
            # Report the dropped rule on stdout instead of writing it.
            print(left, right)
        else:
            f_new.write(line + "\n")
27
28