Skip to content

FATEBoard:多 host 场景下各 host 上传了不同的数据集,但 FATEBoard 中显示的 host 数据集仍然相同 #5786

@sealofyou

Description

@sealofyou

Describe the bug
FATE 1.11.3,多方训练,参数如下:
guest_save_path='/data/projects/fate/fate/upload/315/mnist_train_changeA' host_save_path=['/data/projects/fate/fate/upload/315/mnist_train_changeB', '/data/projects/fate/fate/upload/315/mnist_train_changeC'] guest=9999 host=[10000, 9000] arbiter=9999
其中 host 10000 和 host 9000 在 FATEBoard 中显示的数据集均为 experiment.mnist_train_changeC;按上面的配置,host 10000 预期应显示 experiment.mnist_train_changeB。

host 10000 的日志如下:

# Reader
[INFO] [2025-11-28 10:20:47,814] [202511281020383644780] [445336:140139240544064] - [task_executor._run_] [line:158]: run reader_0 202511281020383644780_reader_0 0 on host 10000 task
[INFO] [2025-11-28 10:20:47,815] [202511281020383644780] [445336:140139240544064] - [task_executor._run_] [line:159]: component parameters on party:
{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "guest": [
            9999
        ],
        "host": [
            10000,
            9000
        ]
    },
    "job_parameters": {
        "job_type": "train",
        "inheritance_info": {},
        "computing_engine": "STANDALONE",
        "federation_engine": "STANDALONE",
        "storage_engine": "STANDALONE",
        "engines_address": {
            "computing": {
                "cores_per_node": 4,
                "nodes": 4
            },
            "federation": {
                "cores_per_node": 4,
                "nodes": 4
            },
            "storage": {
                "cores_per_node": 4,
                "nodes": 4
            }
        },
        "federated_mode": "SINGLE",
        "task_parallelism": 1,
        "computing_partitions": 4,
        "federated_status_collect_type": "PUSH",
        "model_id": "arbiter-9999#guest-9999#host-9000_10000#model",
        "model_version": "202511281020383644780",
        "auto_retries": 0,
        "auto_retry_delay": 1,
        "eggroll_run": {
            "eggroll.session.processors.per.node": 1
        },
        "spark_run": {},
        "rabbitmq_run": {},
        "pulsar_run": {},
        "adaptation_parameters": {
            "task_nodes": 4,
            "task_cores_per_node": 1,
            "task_memory_per_node": 0,
            "request_task_cores": 1,
            "if_initiator_baseline": false
        },
        "task_conf": {},
        "roles": {
            "guest": [
                9999
            ],
            "host": [
                10000,
                9000
            ],
            "arbiter": [
                9999
            ]
        },
        "role_parameters": {}
    },
    "local": {
        "role": "host",
        "party_id": 10000
    },
    "module": "Reader",
    "CodePath": "Reader",
    "ComponentParam": {
        "table": {
            "name": "mnist_train_changeB",
            "namespace": "experiment"
        },
        "_name": "Reader#reader_0"
    },
    "ComponentParameterSource": "reader_0"
}

[INFO] [2025-11-28 10:20:47,996] [202511281020383644780] [445336:140139240544064] - [reader.save_table] [line:207]: source table name: mnist_train_changeB namespace: experiment engine: PATH
[INFO] [2025-11-28 10:20:47,996] [202511281020383644780] [445336:140139240544064] - [reader.save_table] [line:210]: destination table name: e8278626cc4311f087dc0242ac110005 namespace: output_data_202511281020383644780_reader_0_0 engine: PATH

# nn_0
[DEBUG] [2025-11-28 10:23:15,187] [202511281020383644780] [445525:140606553147200] - [data.load_dataset] [line:30]: use cached dataset, cached id /data/projects/fate/fate/upload/315/mnist_train_changeB

# eval_0(日志中未出现相关记录)

请问该问题应如何解决,或者应如何排查?目前训练本身是成功的。

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions