Skip to content

Commit 4d47ed9

Browse files
texasmichelleSeqIO
authored andcommitted
Internal change.
PiperOrigin-RevId: 592356708
1 parent ad99fd3 commit 4d47ed9

1 file changed

Lines changed: 188 additions & 0 deletions

File tree

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# Copyright 2023 The SeqIO Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Microbenchmarks for SeqIO preprocessor functions."""
16+
17+
import os
18+
19+
import google_benchmark
20+
from seqio import dataset_providers
21+
from seqio import feature_converters
22+
from seqio import preprocessors
23+
from seqio import test_utils
24+
from seqio import vocabularies
25+
import tensorflow.compat.v2 as tf
26+
27+
28+
# Short alias so the feature specs below stay compact.
Feature = dataset_providers.Feature

# `test_data` directory that lives in the parent of this file's directory.
_TEST_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'test_data',
)

# Vocabulary shared by every benchmark in this module; it is loaded from the
# SentencePiece model file under `_TEST_DIR`.
_SENTENCEPIECE_VOCAB = vocabularies.SentencePieceVocabulary(
    os.path.join(_TEST_DIR, 'sentencepiece', 'sentencepiece.model')
)

# Feature specs used by the tokenize benchmarks: both features share the same
# vocabulary and differ only in whether EOS is requested.
_OUTPUT_FEATURES = {
    'prefix': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
    'suffix': Feature(_SENTENCEPIECE_VOCAB, add_eos=False),
}
40+
41+
42+
@google_benchmark.register
def rekey(state):
  """Microbenchmark for `preprocessors.rekey`.

  The input dataset is created once, outside the measured loop. Each loop
  iteration only calls `preprocessors.rekey` and discards the result; the
  returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {'text': 'That is good.', 'other': 'That is bad.'}
  source_ds = tf.data.Dataset.from_tensors(example)
  while state:
    _ = preprocessors.rekey(
        source_ds,
        {'inputs': 'other', 'targets': 'text'},
    )
49+
50+
51+
@google_benchmark.register
def tokenize(state):
  """Microbenchmark for `preprocessors.tokenize` on plain string features.

  The input dataset is created once, outside the measured loop. Each loop
  iteration only calls `preprocessors.tokenize` with the module-level
  `_OUTPUT_FEATURES`; the returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {'prefix': 'This is', 'suffix': 'a test.'}
  source_ds = tf.data.Dataset.from_tensors(example)
  while state:
    preprocessors.tokenize(source_ds, output_features=_OUTPUT_FEATURES)
58+
59+
60+
@google_benchmark.register
def tokenize_3_rank(state):
  """Microbenchmark for `preprocessors.tokenize` on nested ragged features.

  Both features are ragged constants built from three-level nested lists of
  single-character strings. The input dataset is created once, outside the
  measured loop; each iteration only calls `preprocessors.tokenize` and the
  returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  prefix = tf.ragged.constant(
      [[['a', 'b'], ['c']], [['d', 'e'], ['f']], [['g', 'h'], ['i']]]
  )
  suffix = tf.ragged.constant(
      [[['j'], ['k', 'l', 'm']], [['n'], ['o', 'p']]]
  )
  source_ds = tf.data.Dataset.from_tensors(
      {'prefix': prefix, 'suffix': suffix}
  )
  while state:
    preprocessors.tokenize(source_ds, output_features=_OUTPUT_FEATURES)
72+
73+
74+
@google_benchmark.register
def tokenize_and_append_eos(state):
  """Microbenchmark for `preprocessors.tokenize_and_append_eos`.

  The input dataset is created once, outside the measured loop. Each loop
  iteration only calls the preprocessor with the module-level
  `_OUTPUT_FEATURES`; the returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {'prefix': 'This is', 'suffix': 'a test.'}
  source_ds = tf.data.Dataset.from_tensors(example)
  while state:
    preprocessors.tokenize_and_append_eos(
        source_ds,
        output_features=_OUTPUT_FEATURES,
    )
83+
84+
85+
@google_benchmark.register
def append_eos(state):
  """Microbenchmark for `preprocessors.append_eos`.

  The example mixes flat lists, a nested list and a ragged constant. The
  'bows' feature is present in the dataset but has no entry in the feature
  specs passed to the preprocessor. Setup happens once, outside the measured
  loop; each iteration only calls `preprocessors.append_eos` and the returned
  dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {
      'inputs': [1, 2, 3],
      'targets': [4, 5, 6, 7],
      'arrows': [8, 9, 10, 11],
      'strings': [[14, 15], [16, 17], [18, 19]],
      'feathers': tf.ragged.constant([[20, 21], [], [22, 23, 24, 25, 26]]),
      'bows': [12, 13],
  }
  source_ds = tf.data.Dataset.from_tensors(example)

  # (feature name, whether EOS is requested), in the order the specs are built.
  eos_by_feature = (
      ('inputs', False),
      ('targets', True),
      ('arrows', True),
      ('strings', True),
      ('feathers', True),
  )
  feature_specs = {
      name: Feature(_SENTENCEPIECE_VOCAB, add_eos=wants_eos)
      for name, wants_eos in eos_by_feature
  }

  while state:
    _ = preprocessors.append_eos(source_ds, feature_specs)
105+
106+
107+
@google_benchmark.register
def append_eos_after_trim(state):
  """Microbenchmark for `preprocessors.append_eos_after_trim`.

  The example mixes flat lists, a nested list and a ragged constant. The
  'bows' feature is present in the dataset but has no entry in either the
  feature specs or the sequence lengths. Setup happens once, outside the
  measured loop; each iteration only calls the preprocessor and the returned
  dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {
      'inputs': [1, 2, 3],
      'targets': [4, 5, 6, 7],
      'arrows': [8, 9, 10, 11],
      'strings': [[14, 15], [16, 17], [18, 19]],
      'feathers': tf.ragged.constant([[20, 21], [], [22, 23, 24, 25, 26]]),
      'bows': [12, 13],
  }
  source_ds = tf.data.Dataset.from_tensors(example)

  feature_specs = {
      'inputs': Feature(_SENTENCEPIECE_VOCAB, add_eos=False),
      'targets': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
      'arrows': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
      'strings': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
      'feathers': Feature(_SENTENCEPIECE_VOCAB, add_eos=True),
  }
  max_lengths = {
      'inputs': 4,
      'targets': 3,
      'arrows': 5,
      'strings': 3,
      'feathers': 4,
  }

  while state:
    _ = preprocessors.append_eos_after_trim(
        source_ds,
        output_features=feature_specs,
        sequence_length=max_lengths,
    )
138+
139+
140+
@google_benchmark.register
def truncate_inputs_left(state):
  """Microbenchmark for `preprocessors.truncate_inputs_left`.

  The 'inputs' feature has three tokens while its configured length is 2;
  the 'targets' feature has four tokens and a configured length of 4. Setup
  happens once, outside the measured loop; each iteration only calls the
  preprocessor and the returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {
      'inputs': [1, 2, 3],
      'targets': [4, 5, 6, 7],
  }
  source_ds = tf.data.Dataset.from_tensors(example)
  max_lengths = {'inputs': 2, 'targets': 4}
  while state:
    _ = preprocessors.truncate_inputs_left(source_ds, max_lengths)
149+
150+
151+
@google_benchmark.register
def apply_feature_converter(state):
  """Microbenchmark for `preprocessors.apply_feature_converter`.

  A one-example dataset is built with `test_utils.create_default_dataset`;
  it carries a 'redundant_feature' that has no entry in the sequence lengths.
  The converter is an `EncDecFeatureConverter` created once, outside the
  measured loop. Each iteration only calls the preprocessor and the returned
  dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {
      'inputs': [8, 7, 1, 0],
      'targets': [4, 1, 0],
      'redundant_feature': [0],
  }
  source_ds = test_utils.create_default_dataset(
      [example],
      feature_names=('inputs', 'targets', 'redundant_feature'),
  )
  max_lengths = {'inputs': 8, 'targets': 7}
  converter = feature_converters.EncDecFeatureConverter()
  while state:
    _ = preprocessors.apply_feature_converter(
        source_ds,
        sequence_length=max_lengths,
        feature_converter=converter,
    )
164+
165+
166+
# TODO(b/315985098): Ask mishragaurav@ for a good example and create a test.
167+
# @google_benchmark.register
168+
# def hash_and_tile_subtask_id(state):
169+
# og_dataset = tf.data.Dataset.from_tensors({
170+
# 'inputs': 'This is',
171+
# 'targets': 'a test.',
172+
# 'provenance/task': 'test_task_name',
173+
# })
174+
# while state:
175+
# _ = preprocessors.hash_and_tile_subtask_id(og_dataset)
176+
177+
178+
@google_benchmark.register
def preprocess_tensorflow_examples(state):
  """Microbenchmark for `preprocessors.preprocess_tensorflow_examples`.

  The input dataset is created once, outside the measured loop. Each loop
  iteration only calls the preprocessor with the two format strings below;
  the returned dataset is never iterated here.

  Args:
    state: benchmark state object; the loop keeps running while it is truthy.
  """
  example = {'text': 'Hello', 'label': 'World'}
  source_ds = tf.data.Dataset.from_tensors(example)
  while state:
    _ = preprocessors.preprocess_tensorflow_examples(
        source_ds,
        'Input: {text}',
        'Output: {label}',
    )
185+
186+
187+
# Run every benchmark registered above when this file is executed as a script.
if __name__ == '__main__':
  google_benchmark.main()

0 commit comments

Comments
 (0)