102 changes: 79 additions & 23 deletions datasets/openwebtext/create_tfrecords.py
@@ -7,18 +7,49 @@
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from absl import app
from absl import flags

import encoder

base_dir = "/home/connor/2/newspaper" # Path to where your .txt files are located
files_per = 175000 # 175000 ~ 200-300MB
name = "openwebtext-newspaper" # Name of output files will be name_i.tfrecords where i is the number of the file
output_dir = "/home/connor/out"
log_dir = "logs"
files = glob.glob(os.path.join(base_dir, "**/*.txt"))
processes = 64 # Number of encoding processes to run
encoder_path = "gs://openwebtext/stuff/encoder" # Path to encoder files
minimum_size = 25
FLAGS = flags.FLAGS


flags.DEFINE_string(
    "base_dir",
    default="/home/connor/2/newspaper",
    help="Path to where your .txt files are located.")

flags.DEFINE_string(
    "output_dir",
    default="/home/connor/out",
    help="Destination directory for the tfrecords files.")

flags.DEFINE_string(
    "log_dir",
    default="logs",
    help="Directory for processing logs.")

flags.DEFINE_string(
    "encoder_path",
    default="gs://openwebtext/stuff/encoder",
    help="Path to encoder files.")

flags.DEFINE_string(
    "name",
    default="openwebtext-newspaper",
    help="Output files will be named name_i.tfrecords, where i is the chunk number.")

flags.DEFINE_integer(
    "processes",
    default=64,
    help="Number of encoding processes to run.")

flags.DEFINE_integer(
    "minimum_size",
    default=25,
    help="Minimum text size.")

flags.DEFINE_integer(
    "files_per",
    default=1500,
    help="Number of .txt files per tfrecords chunk.")


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
@@ -38,11 +69,6 @@ def chunks(l, n):
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

enc = encoder.get_encoder(encoder_path)

file_chunks = chunks(files, files_per)

print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))

def create_file(args):
    i, chunk = args
@@ -56,7 +82,7 @@ def create_file(args):
    good_files = 0
    current = None
    for fn in chunk:
        with tf.gfile.Open(fn, "r") as f:
        with tf.io.gfile.GFile(fn, "r") as f:
            d = f.read()
        d = ftfy.fix_text(d, normalization='NFKC')
        data = np.array(enc.encode(d), np.int32)
@@ -78,12 +104,42 @@

    return good_files
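# --- Illustration (not part of the PR): the body of create_file is collapsed
# in this diff view. A hedged sketch of the serialization step it performs,
# using the _int64_feature helper above; fn_out is hypothetical and the exact
# elided lines are assumed, not the PR's verbatim code:
#
#   fn_out = os.path.join(output_dir, "{}_{}.tfrecords".format(name, i))
#   with tf.io.TFRecordWriter(fn_out) as writer:
#       feature = {"text": _int64_feature(data)}  # data: np.int32 token ids
#       example = tf.train.Example(features=tf.train.Features(feature=feature))
#       writer.write(example.SerializeToString())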

start = time.time()
pool = Pool(processes=processes)
good = 0
for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
    good += g

end = time.time()

print("Done! In {:.2f}s, {} / {} good files.".format(end-start, str(good), str(len(files))))
def main(argv):
    # create_file reads these at module scope, so rebind them as globals
    # rather than shadowing them with locals.
    global enc, files
    global base_dir, files_per, name, output_dir, log_dir, processes, minimum_size, encoder_path

    base_dir = FLAGS.base_dir  # Path to where your .txt files are located
    files_per = FLAGS.files_per  # Number of .txt files per tfrecords chunk
    name = FLAGS.name  # Output files will be named name_i.tfrecords, where i is the chunk number
    output_dir = FLAGS.output_dir
    log_dir = FLAGS.log_dir
    processes = FLAGS.processes  # Number of encoding processes to run
    encoder_path = FLAGS.encoder_path  # Path to encoder files
    minimum_size = FLAGS.minimum_size

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    print(base_dir)
    # The pattern must be relative: os.path.join discards base_dir if the
    # pattern starts with "/".
    files = glob.glob(os.path.join(base_dir, "**/*.txt"), recursive=True)

    enc = encoder.get_encoder(encoder_path)

    file_chunks = chunks(files, files_per)

    print("Got {} files, divided into {} chunks.".format(str(len(files)), str(len(file_chunks))))

    start = time.time()
    pool = Pool(processes=processes)
    good = 0
    for g in tqdm(pool.imap(create_file, enumerate(file_chunks)), total=len(file_chunks)):
        good += g

    end = time.time()

    print("Done! In {:.2f}s, {} / {} good files.".format(end - start, str(good), str(len(files))))

if __name__ == '__main__':
    app.run(main)
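A note on the glob fix in main() above: os.path.join discards every earlier component when a later one is absolute, which is why the previous "/**/*.txt" pattern silently ignored base_dir. A quick illustration, using the flag's default path purely for demonstration:

import os
os.path.join("/home/connor/2/newspaper", "/**/*.txt")  # -> "/**/*.txt" (base_dir discarded)
os.path.join("/home/connor/2/newspaper", "**/*.txt")   # -> "/home/connor/2/newspaper/**/*.txt"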
6 changes: 3 additions & 3 deletions datasets/openwebtext/encoder.py
@@ -107,12 +107,12 @@ def decode(self, tokens):
        return text

def get_encoder(encoder_path):
    with tf.gfile.Open(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
    with tf.io.gfile.GFile(os.path.join(encoder_path, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with tf.gfile.Open(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f:  # utf-8?
    with tf.io.gfile.GFile(os.path.join(encoder_path, 'vocab.bpe'), 'r') as f:  # utf-8?
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
    )
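For completeness, a minimal sketch of how get_encoder is consumed by create_tfrecords.py, assuming a GPT-2-style BPE encoder with encode/decode methods as the diff implies; the sample text is hypothetical:

enc = get_encoder("gs://openwebtext/stuff/encoder")  # reads encoder.json + vocab.bpe
ids = enc.encode("Hello world")      # text -> list of int token ids
assert enc.decode(ids) == "Hello world"  # ids -> text round-trip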