# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train word-level LMs on 1 Billion Words benchmark data."""
import os
from lingvo import model_registry
from lingvo.core import base_model_params
from lingvo.core import layers
from lingvo.core import optimizer
from lingvo.core import py_utils
from lingvo.core import schedule
from lingvo.core import tokenizers
from lingvo.tasks.lm import input_generator as lm_inp
from lingvo.tasks.lm import layers as lm_layers
from lingvo.tasks.lm import model


class WordLevelOneBwdsBase(base_model_params.SingleTaskModelParams):
  """Params for training a word-level LM on One Billion Wds text corpus.

  Tries to match https://github.com/rafaljozefowicz/lm.
  """

  # Generated using lingvo/tasks/lm/tools:download_lm1b.
  CORPUS_DIR = (
      '/tmp/lm1b/1-billion-word-language-modeling-benchmark-r13output/')

  # BIG-LSTM model size: embedding/projection dim = 1024; LSTM state dim = 8192
  EMBEDDING_DIM = 1024
  MAX_TOKENS = 1024
  NUM_EMBEDDING_SHARDS = 8
  NUM_SAMPLED = 8192
  NUM_SOFTMAX_SHARDS = 8
  RNN_STATE_DIM = 8192
  VOCAB_SIZE = 793472  # includes <epsilon>
  WORD_VOCAB = os.path.join(CORPUS_DIR, 'vocab.txt')

  def Train(self):
    p = lm_inp.LmInput.Params()
    p.bucket_upper_bound = [10, 20, 30, 40, 50, 100, 256, 512, 1024]
    p.bucket_batch_limit = [1024, 512, 256, 256, 128, 128, 64, 32, 16]
    p.file_buffer_size = 10000000
    p.file_parallelism = 10
    p.file_pattern = 'text:' + os.path.join(
        self.CORPUS_DIR, 'training-monolingual.tokenized.shuffled', 'news.en*')
    p.name = '1bwds_train_set'
    p.tokenizer = tokenizers.VocabFileTokenizer.Params()
    p.num_batcher_threads = 16
    p.target_max_length = self.MAX_TOKENS
    p.tokenizer.target_sos_id = 1
    p.tokenizer.target_eos_id = 2
    p.tokenizer.target_unk_id = 3
    p.tokenizer.token_vocab_filepath = self.WORD_VOCAB
    p.tokenizer.vocab_size = self.VOCAB_SIZE
    return p

  def Dev(self):
    p = self.Train()
    # Use small batches for eval.
    p.bucket_upper_bound = [10, 20, 30, 40, 50, 100, 256, 512, 1024]
    p.bucket_batch_limit = [128, 64, 32, 32, 16, 16, 4, 2, 1]
    p.file_buffer_size = 1
    p.file_parallelism = 1
    p.file_pattern = 'text:' + os.path.join(
        self.CORPUS_DIR, 'heldout-monolingual.tokenized.shuffled',
        'news.en.heldout-00001*')
    p.name = '1bwds_dev_set'
    p.num_batcher_threads = 1
    p.num_samples = 6206  # Number of sentences to evaluate on.
    return p

  def Test(self):
    p = self.Dev()
    p.file_pattern = 'text:' + os.path.join(
        self.CORPUS_DIR, 'heldout-monolingual.tokenized.shuffled',
        'news.en.heldout-00000*')
    p.name = '1bwds_test_set'
    p.num_samples = 6075  # Number of sentences to evaluate on.
    return p

  def Task(self):
    p = model.LanguageModel.Params()
    p.name = '1bwds_word_level_lm'
    p.eval.samples_per_summary = 10000

    p.lm = lm_layers.RnnLm.CommonParams(
        vocab_size=self.VOCAB_SIZE,
        emb_dim=self.EMBEDDING_DIM,
        num_layers=2,
        residual_start=3,  # disable residuals
        rnn_dims=self.EMBEDDING_DIM,
        rnn_hidden_dims=self.RNN_STATE_DIM)

    # Input embedding needs to be sharded.
    p.lm.emb.max_num_shards = self.NUM_EMBEDDING_SHARDS
    p.lm.embedding_dropout_keep_prob = 0.75
    # Match the initialization in the github code.
    p.lm.emb.params_init = py_utils.WeightInit.UniformUnitScaling(
        1.0 * self.NUM_EMBEDDING_SHARDS)
    # We also want dropout after each of the RNN layers.
    p.lm.rnns.dropout.keep_prob = 0.75

    # Adjust training params.
    tp = p.train
    tp.sum_loss_across_tokens_in_batch = True
    # Disable any so-called "clipping" (really gradient scaling).
    tp.clip_gradient_norm_to_value = 0.0
    tp.grad_norm_to_clip_to_zero = 0.0
    # Do clip the LSTM gradients.
    tp.max_lstm_gradient_norm = 16
    # Straight Adagrad; very sensitive to the initial accumulator value: the
    # default of 0.1 is far from adequate.
    # TODO(ciprianchelba): tune accumulator value, learning rate, clipping
    # threshold.
    tp.learning_rate = 0.2
    tp.lr_schedule = (
        schedule.PiecewiseConstantSchedule.Params().Set(
            boundaries=[], values=[1.0]))
    tp.l2_regularizer_weight = None  # No regularization.
    tp.optimizer = optimizer.Adagrad.Params()
    return p


@model_registry.RegisterSingleTaskModel
class WordLevelOneBwdsSimpleSampledSoftmax(WordLevelOneBwdsBase):
  """Use sampled soft-max in training."""

  def Task(self):
    p = super().Task()
    num_input_dim = p.lm.softmax.input_dim
    p.lm.softmax = layers.SimpleFullSoftmax.Params()
    p.lm.softmax.input_dim = num_input_dim
    p.lm.softmax.num_classes = self.VOCAB_SIZE
    p.lm.softmax.num_sampled = self.NUM_SAMPLED
    p.lm.softmax.num_shards = self.NUM_SOFTMAX_SHARDS
    # Match the initialization in github code.
    p.lm.softmax.params_init = py_utils.WeightInit.UniformUnitScaling(
        1.0 * self.NUM_SOFTMAX_SHARDS)
    assert p.lm.softmax.num_classes % p.lm.softmax.num_shards == 0
    return p


@model_registry.RegisterSingleTaskModel
class WordLevelOneBwdsSimpleSampledSoftmaxTiny(
    WordLevelOneBwdsSimpleSampledSoftmax):
  """Tiny model size for local, debugging runs of the above."""

  EMBEDDING_DIM = 7
  MAX_TOKENS = 1024
  NUM_EMBEDDING_SHARDS = 1
  NUM_SAMPLED = 8
  NUM_SOFTMAX_SHARDS = 8
  RNN_STATE_DIM = 32
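
  # A quick local sanity run of this tiny config can reuse the trainer
  # invocation pattern shown in this file; the flags and logdir below are
  # illustrative placeholders, not a tested command:
  #
  # trainer --run_locally=cpu --mode=sync \
  #   --model=lm.one_billion_wds.WordLevelOneBwdsSimpleSampledSoftmaxTiny \
  #   --logdir=/tmp/lm1b_tiny/log --logtostderr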


# Example large transformer model using GPIPE.
# Instructions:
# trainer --run_locally=gpu --mode=sync \
#   --model=lm.one_billion_wds.OneBWdsGPipeTransformerWPM \
#   --logdir=/tmp/lm/log --logtostderr --worker_split_size=4 --worker_gpus=4
#
# Relative throughput on multiple V100s, each with 16GB RAM:
#   GPUs    throughput
#      1    1
#      2    0.93
#      4    0.85
#      8    0.775
@model_registry.RegisterSingleTaskModel
class OneBWdsGPipeTransformerWPM(WordLevelOneBwdsBase):
  """LM using gpipe transformer."""

  VOCAB_SIZE = 32000
  EMBEDDING_DIM = 2048
  BATCH_SIZE = 32
  MAX_TOKENS = 1024  # The max sequence length in one example.

  # GPIPE related params.
  GPUS = 4
  # A list of ending indices for each split/partition, in ascending order.
  # For example, SPLITS = [8, 16, 24, 32] defines a 32-layer model with 4
  # splits, each of which contains 8 layers.
  # The numbers below run on 16GB V100s. Your mileage may vary.
  SPLITS = [8 * (i + 1) for i in range(GPUS)]
  LAYERS = SPLITS[-1]
  # Set NUM_MICRO_BATCHES >= len(SPLITS) * 4 to minimize the gpipe bubble.
  NUM_MICRO_BATCHES = 32
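
  # With the defaults above: SPLITS = [8, 16, 24, 32], so LAYERS = 32 and each
  # of the 4 partitions holds 8 transformer layers. NUM_MICRO_BATCHES = 32
  # satisfies len(SPLITS) * 4 = 16, and the per-step batch of BATCH_SIZE = 32
  # is split into micro-batches of size BATCH_SIZE // NUM_MICRO_BATCHES = 1.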

  def Train(self):
    p = super().Train()
    # Replace it with your own wordpiece tokenizer.
    p.tokenizer = tokenizers.AsciiTokenizer.Params()
    p.target_max_length = self.MAX_TOKENS
    p.tokenizer.target_sos_id = 1
    p.tokenizer.target_eos_id = 2
    p.tokenizer.target_unk_id = 0
    p.tokenizer.vocab_size = self.VOCAB_SIZE
    p.bucket_upper_bound = [self.MAX_TOKENS]
    p.bucket_batch_limit = [self.BATCH_SIZE]
    p.fixed_input_shape = True
    return p

  def Dev(self):
    p = self.Train()
    p.file_pattern = 'text:' + os.path.join(
        self.CORPUS_DIR, 'heldout-monolingual.tokenized.shuffled',
        'news.en.heldout-00001*')
    p.name = '1bwds_dev_set'
    p.num_batcher_threads = 1
    p.num_samples = 6206  # Number of sentences to evaluate on.
    return p

  def Test(self):
    p = self.Dev()
    p.file_pattern = 'text:' + os.path.join(
        self.CORPUS_DIR, 'heldout-monolingual.tokenized.shuffled',
        'news.en.heldout-00000*')
    p.name = '1bwds_test_set'
    p.num_samples = 6075  # Number of sentences to evaluate on.
    return p

  def Task(self):
    """Language model on 1bw dataset using gpipe transformer."""
    p = model.BatchMajorLanguageModel.Params()
    p.eval.samples_per_summary = 0
    p.name = '1bwds_wpm_level_lm'
    p.lm = lm_layers.GPipeTransformerLm.CommonParams(
        model_dim=self.EMBEDDING_DIM,
        vocab_size=self.VOCAB_SIZE,
        hidden_dim=self.EMBEDDING_DIM * 4,
        num_layers=self.LAYERS,
        splits=self.SPLITS,
        num_micro_batches=self.NUM_MICRO_BATCHES,
        micro_batch_size=self.BATCH_SIZE // self.NUM_MICRO_BATCHES,
        num_heads=16,
        softmax_max_alloc=128 * (2**20),
        atten_dropout_prob=0.1,
        residual_dropout_prob=0.1)
    p.train.Set(
        learning_rate=0.5,
        optimizer=optimizer.Adam.ParamsA(),
        clip_gradient_norm_to_value=0.0,
        grad_norm_to_clip_to_zero=0.0,
        lr_schedule=schedule.TransformerSchedule.Params().Set(
            warmup_steps=40000, worker_replicas=1,
            model_dim=self.EMBEDDING_DIM))
    return p
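

# A minimal sketch of loading one of the registered configs outside the
# trainer, assuming the standard lingvo model_registry API (the registry key
# and dataset name follow the registrations above):
#
#   from lingvo import model_registry
#   params = model_registry.GetParams(
#       'lm.one_billion_wds.WordLevelOneBwdsSimpleSampledSoftmax', 'Train')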