@@ -64,7 +64,7 @@ def forward(self, input_ids, position_ids=None):
 
 class DistilBertPretrainedModel(PretrainedModel):
     """
-    An abstract class for pretrained DistilBERT models. It provides DistilBERT related
+    An abstract class for pretrained DistilBert models. It provides DistilBert related
     `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
     `pretrained_init_configuration`, `base_model_prefix` for downloading and
     loading pretrained models. See `PretrainedModel` for more details.
@@ -131,6 +131,62 @@ def init_weights(self, layer):
 
 @register_base_model
 class DistilBertModel(DistilBertPretrainedModel):
+    """
+    The bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.
+
+    This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
+    Refer to the superclass documentation for the generic methods.
+
+    This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation
+    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer
+    and refer to the Paddle documentation for all matters related to general usage and behavior.
+
+    Args:
+        vocab_size (int):
+            Vocabulary size of `input_ids` in `DistilBertModel`. Defines the number of different tokens that can
+            be represented by the `input_ids` passed when calling `DistilBertModel`.
+        hidden_size (int, optional):
+            Dimensionality of the embedding layer, encoder layers and the pooler layer. Defaults to `768`.
+        num_hidden_layers (int, optional):
+            Number of hidden layers in the Transformer encoder. Defaults to `12`.
+        num_attention_heads (int, optional):
+            Number of attention heads for each attention layer in the Transformer encoder.
+            Defaults to `12`.
+        intermediate_size (int, optional):
+            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors
+            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,
+            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.
+            Defaults to `3072`.
+        hidden_act (str, optional):
+            The non-linear activation function in the feed-forward layer.
+            ``"gelu"``, ``"relu"`` and any other Paddle-supported activation functions
+            are supported. Defaults to `"gelu"`.
+        hidden_dropout_prob (float, optional):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+            Defaults to `0.1`.
+        attention_probs_dropout_prob (float, optional):
+            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention targets.
+            Defaults to `0.1`.
+        max_position_embeddings (int, optional):
+            The maximum length of the position encoding, which determines the maximum supported length of an
+            input sequence. Defaults to `512`.
+        type_vocab_size (int, optional):
+            The vocabulary size of `token_type_ids`.
+            Defaults to `16`.
+        initializer_range (float, optional):
+            The standard deviation of the normal initializer.
+            Defaults to `0.02`.
+
+            .. note::
+                The weight matrices are initialized from a normal distribution with this standard deviation.
+                See :meth:`DistilBertPretrainedModel.init_weights()` for how weights are initialized in `DistilBertModel`.
+
+        pad_token_id (int, optional):
+            The index of the padding token in the token vocabulary.
+            Defaults to `0`.
+
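+    Example:
+        A minimal construction sketch with explicit hyperparameters (the values shown
+        are illustrative; `from_pretrained` is the usual way to load a trained model).
+
+        .. code-block::
+
+            from paddlenlp.transformers import DistilBertModel
+
+            model = DistilBertModel(vocab_size=30522,
+                                    hidden_size=768,
+                                    num_hidden_layers=6,
+                                    num_attention_heads=12)
+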
+    """
+
     def __init__(self,
                  vocab_size,
                  hidden_size=768,
@@ -162,6 +218,44 @@ def __init__(self,
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r'''
+        The DistilBertModel forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                Indices of input sequence tokens in the vocabulary. They are
+                numerical representations of tokens that build the input sequence.
+                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
+            attention_mask (Tensor, optional):
+                Mask used in multi-head attention to avoid performing attention on some unwanted positions,
+                usually the paddings or the subsequent positions.
+                Its data type can be int, float or bool.
+                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.
+                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.
+                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.
+                It is a tensor whose shape is broadcast to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.
+                For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length] or
+                [batch_size, num_attention_heads, sequence_length, sequence_length].
+                Defaults to `None`, which means no positions are masked. See the example below for one way
+                to build such a mask.
+
+        Returns:
+            Tensor: Returns tensor `encoder_output`, the sequence of hidden-states at the last layer of the model.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertModel, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                output = model(**inputs)
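+                # An explicit mask may also be passed. A sketch (assuming the pad token
+                # id is 0, this model's default): non-pad positions get 1 and pad
+                # positions get 0, matching the int mask convention described above.
+                attention_mask = paddle.unsqueeze(
+                    (inputs['input_ids'] != 0).astype('int64'), axis=[1, 2])
+                output = model(inputs['input_ids'], attention_mask=attention_mask)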
+        '''
+
         if attention_mask is None:
             attention_mask = paddle.unsqueeze(
                 (input_ids == self.pad_token_id
@@ -174,6 +268,21 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForSequenceClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the output layer, designed for
+    sequence classification/regression tasks like GLUE tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for the output of DistilBert.
+            If None, uses the same value as `hidden_dropout_prob` of the `DistilBertModel`
+            instance `distilbert`. Defaults to None.
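+
+    Example:
+        A minimal construction sketch: the backbone is built first and then wrapped
+        by the classification head (`num_classes=3` is an illustrative value).
+
+        .. code-block::
+
+            from paddlenlp.transformers import DistilBertModel, DistilBertForSequenceClassification
+
+            distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
+            model = DistilBertForSequenceClassification(distilbert, num_classes=3)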
+    """
+
     def __init__(self, distilbert, num_classes=2, dropout=None):
         super(DistilBertForSequenceClassification, self).__init__()
         self.num_classes = num_classes
@@ -188,6 +297,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForSequenceClassification forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input text classification logits.
+            Its shape is `[batch_size, num_classes]` and its data type is `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForSequenceClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+                logits = model(**inputs)
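+                # an illustrative next step: class probabilities and the predicted label index
+                probs = paddle.nn.functional.softmax(logits, axis=-1)
+                predicted_class = paddle.argmax(probs, axis=-1)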
+        """
+
         distilbert_output = self.distilbert(
             input_ids=input_ids, attention_mask=attention_mask)
 
@@ -202,6 +341,19 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForQuestionAnswering(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output to
+    compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        dropout (float, optional):
+            The dropout probability for the output of DistilBert.
+            If None, uses the same value as `hidden_dropout_prob` of the `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
     def __init__(self, distilbert, dropout=None):
         super(DistilBertForQuestionAnswering, self).__init__()
         self.distilbert = distilbert  # allow bert to be config
@@ -211,6 +363,46 @@ def __init__(self, distilbert, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForQuestionAnswering forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            tuple: Returns tuple (`start_logits`, `end_logits`).
+
+            With the fields:
+
+            - `start_logits` (Tensor):
+                A tensor of the input token classification logits, indicating the start position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+            - `end_logits` (Tensor):
+                A tensor of the input token classification logits, indicating the end position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForQuestionAnswering
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                outputs = model(**inputs)
+
+                start_logits = outputs[0]
+                end_logits = outputs[1]
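+                # a decoding sketch: the most probable start and end positions; real
+                # SQuAD-style decoding also constrains end >= start and the span length
+                start_index = paddle.argmax(start_logits, axis=-1)
+                end_index = paddle.argmax(end_logits, axis=-1)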
+        """
+
         sequence_output = self.distilbert(
             input_ids, attention_mask=attention_mask)
         sequence_output = self.dropout(sequence_output)
@@ -221,6 +413,21 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForTokenClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output layer,
+    designed for token classification tasks like NER tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for the output of DistilBert.
+            If None, uses the same value as `hidden_dropout_prob` of the `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
     def __init__(self, distilbert, num_classes=2, dropout=None):
         super(DistilBertForTokenClassification, self).__init__()
         self.num_classes = num_classes
@@ -232,6 +439,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForTokenClassification forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input token classification logits.
+            Its shape is `[batch_size, sequence_length, num_classes]` and its data type is `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForTokenClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+                logits = model(**inputs)
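+                # an illustrative next step: the predicted class index for every token
+                predicted_classes = paddle.argmax(logits, axis=-1)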
+        """
+
         sequence_output = self.distilbert(
             input_ids, attention_mask=attention_mask)
 
@@ -241,6 +478,14 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForMaskedLM(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a `language modeling` head on top.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+    """
+
     def __init__(self, distilbert):
         super(DistilBertForMaskedLM, self).__init__()
         self.distilbert = distilbert
@@ -255,6 +500,33 @@ def __init__(self, distilbert):
         self.apply(self.init_weights)
 
     def forward(self, input_ids=None, attention_mask=None):
+        r'''
+        The DistilBertForMaskedLM forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `prediction_logits`, the scores of masked token prediction.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                prediction_logits = model(**inputs)
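+                # an illustrative next step: the most likely token id at every position
+                predicted_token_ids = paddle.argmax(prediction_logits, axis=-1)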
+        '''
+
         distilbert_output = self.distilbert(
             input_ids=input_ids, attention_mask=attention_mask)
         prediction_logits = self.vocab_transform(distilbert_output)