Skip to content

Commit 9844a91

Browse files
authored
add the chinese task for the taskflow
add the chinese task for the taskflow
2 parents d51d5d4 + c09d901 commit 9844a91

22 files changed

+2882
-729
lines changed

examples/language_model/gpt/export_model.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,11 @@ def main():
5858
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
5959

6060
# Build model and load trained parameters
61-
model = model_class.from_pretrained(args.model_path, max_predict_len=32)
6261
tokenizer = tokenizer_class.from_pretrained(args.model_path)
62+
model = model_class.from_pretrained(
63+
args.model_path,
64+
max_predict_len=32,
65+
eol_token_id=self.tokenizer.eol_token_id)
6366
# Switch to eval mode
6467
model.eval()
6568
# Convert to static graph with specific input description

examples/language_model/gpt/predict.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ def __init__(self,
4040
self.tokenizer = tokenizer_class.from_pretrained(model_name_or_path)
4141
logger.info('Loading the model parameters, please wait...')
4242
self.model = model_class.from_pretrained(
43-
model_name_or_path, max_predict_len=max_predict_len)
43+
model_name_or_path,
44+
max_predict_len=max_predict_len,
45+
eol_token_id=self.tokenizer.eol_token_id)
4446
self.model.eval()
4547
logger.info('Model loaded.')
4648

@@ -49,7 +51,7 @@ def predict(self, text):
4951
ids = self.tokenizer(text)["input_ids"]
5052
input_ids = paddle.to_tensor(
5153
np.array(ids).reshape(1, -1).astype('int64'))
52-
out = self.model(input_ids, self.tokenizer.eol_token_id)
54+
out = self.model(input_ids)
5355
out = [int(x) for x in out.numpy().reshape([-1])]
5456
logger.info(self.tokenizer.convert_ids_to_string(out))
5557

examples/text_to_knowledge/ernie-ctm/predictor.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

examples/text_to_knowledge/ernie-ctm/predictor.py

Lines changed: 461 additions & 0 deletions
Large diffs are not rendered by default.

examples/text_to_knowledge/wordtag/README.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ Term-Linking示例程序可以对无标签数据可以启动模型预测, 例如
5454

5555
执行下面的脚本即可快速获取上面两段文本的百科知识树链接的结果
5656

57-
```bash
58-
from paddlenlp.taskflow import TaskFlow
59-
task = TaskFlow("text2knowledge", model="wordtag")
57+
```python
58+
from paddlenlp import Taskflow
59+
task = Taskflow("text2knowledge", model="wordtag")
6060
task(["热梅茶是一道以梅子为主要原料制作的茶饮",
6161
"《孤女》是2010年九州出版社出版的小说,作者是余兼羽"])
6262
# Support the input text directly
@@ -70,6 +70,14 @@ task("热梅茶是一道以梅子为主要原料制作的茶饮")
7070
{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1, 'termid': '介词_cb_以'}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2, 'termid': '饮食_cb_梅'}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_为'}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4, 'termid': '物品_cb_主要原料'}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_制作'}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2, 'termid': '饮品_cb_茶饮'}]}
7171
```
7272

73+
同时我们也提供了基于上述 Taskflow 的 Python 执行脚本,具体的执行方式如下:
74+
```shell
75+
python predict.py --max_seq_len 128 --batch_size 2
76+
```
77+
其中参数释义如下:
78+
- `max_seq_len` 表示最大句子长度,超过该长度将被截断。
79+
- `batch_size` 表示每个预测批次的样本数目。
80+
7381
## WordTag后续计划
7482

7583
1. 持续优化知识标注模型,获得更加精准的标注结果;

examples/text_to_knowledge/wordtag/predict.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,13 @@
1616
import argparse
1717

1818
import paddle
19-
20-
from predictor import WordtagPredictor
19+
from paddlenlp import Taskflow
2120

2221

2322
def parse_args():
2423
parser = argparse.ArgumentParser()
2524

2625
# yapf: disable
27-
parser.add_argument("--data_dir", default="./data", type=str, help="The input data dir, should contain [train/test].json and [train/test]_metrics.json .")
2826
parser.add_argument("--max_seq_len", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.", )
2927
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.", )
3028
parser.add_argument("--device", default="gpu", type=str, choices=["cpu", "gpu", "xpu"] ,help="The device to select to train the model, is must be cpu/gpu/xpu.")
@@ -36,10 +34,13 @@ def parse_args():
3634

3735
def do_predict(args):
3836
paddle.set_device(args.device)
39-
predictor = WordtagPredictor(term_linking=True)
37+
wordtag = Taskflow(
38+
"text2knowledge",
39+
model="wordtag",
40+
batch_size=args.batch_size,
41+
max_seq_length=args.max_seq_len)
4042
txts = ["《孤女》是2010年九州出版社出版的小说,作者是余兼羽。", "热梅茶是一道以梅子为主要原料制作的茶饮"]
41-
42-
res = predictor.run(txts)
43+
res = wordtag(txts)
4344
print(res)
4445

4546

0 commit comments

Comments
 (0)