Data_split
Ah, I understand now - for each SD pair within a client's 25% sample, you want to split the actual trajectories for that SD pair 50/50 between train and valid. Let me modify the code:
import pickle
import random
from collections import defaultdict

def get_trajectories_by_sd_pairs(data):
    """Group trajectories by their SD (source-destination) pairs."""
    sd_pair_trajectories = defaultdict(list)
    for key, trajectory in data.items():
        n_geo = trajectory['n_geo']
        if len(n_geo) < 2:
            continue
        source = tuple(n_geo[0])
        destination = tuple(n_geo[-1])
        sd_pair = (source, destination)
        sd_pair_trajectories[sd_pair].append(trajectory)
    return sd_pair_trajectories
def split_data_for_clients(data, num_clients=5, sample_percentage=0.25):
    """Split data across clients with overlapping SD pairs."""
    # Group trajectories by SD pairs
    sd_pair_trajectories = get_trajectories_by_sd_pairs(data)
    all_sd_pairs = list(sd_pair_trajectories.keys())
    total_sd_pairs = len(all_sd_pairs)
    pairs_per_client = int(total_sd_pairs * sample_percentage)

    # Initialize client data structures
    client_splits = {i: {'train': {}, 'valid': {}} for i in range(num_clients)}
    client_sd_pairs = {i: set() for i in range(num_clients)}

    # Assign SD pairs to clients (sampled independently per client, so pairs can overlap)
    for client_id in range(num_clients):
        selected_pairs = random.sample(all_sd_pairs, pairs_per_client)
        client_sd_pairs[client_id] = set(selected_pairs)

        # For each selected SD pair, split its trajectories 50/50
        train_idx = 0
        valid_idx = 0
        for sd_pair in selected_pairs:
            trajectories = sd_pair_trajectories[sd_pair]
            # Randomly shuffle trajectories for this SD pair
            random.shuffle(trajectories)
            # Split trajectories 50/50
            mid_point = len(trajectories) // 2
            # Assign to train set
            for traj in trajectories[:mid_point]:
                client_splits[client_id]['train'][str(train_idx).zfill(4)] = traj
                train_idx += 1
            # Assign to valid set
            for traj in trajectories[mid_point:]:
                client_splits[client_id]['valid'][str(valid_idx).zfill(4)] = traj
                valid_idx += 1

    # Calculate statistics
    stats = {
        'total_trajectories': len(data),
        'total_sd_pairs': total_sd_pairs,
        'client_stats': {}
    }
    for client_id in range(num_clients):
        train_trajectories = len(client_splits[client_id]['train'])
        valid_trajectories = len(client_splits[client_id]['valid'])
        stats['client_stats'][client_id] = {
            'selected_sd_pairs': len(client_sd_pairs[client_id]),
            'train_trajectories': train_trajectories,
            'valid_trajectories': valid_trajectories,
            'total_trajectories': train_trajectories + valid_trajectories
        }
    return client_splits, stats
# Load the data
with open('/home/haitao/data/CausalTAD/datasets/chengdu/trajectories/train.pickle', 'rb') as f:
    train_data = pickle.load(f)

# Split the data
client_splits, stats = split_data_for_clients(train_data)

# Print statistics
print("Original dataset statistics:")
print(f"Total trajectories: {stats['total_trajectories']}")
print(f"Total unique SD pairs: {stats['total_sd_pairs']}")
print("\nClient statistics:")
for client_id, client_stats in stats['client_stats'].items():
    print(f"\nClient {client_id}:")
    print(f"Selected SD pairs: {client_stats['selected_sd_pairs']}")
    print(f"Training trajectories: {client_stats['train_trajectories']}")
    print(f"Validation trajectories: {client_stats['valid_trajectories']}")
    print(f"Total trajectories: {client_stats['total_trajectories']}")

# Save the split data
for client_id, splits in client_splits.items():
    # Save training data
    train_output_path = f'/home/haitao/data/CausalTAD/datasets/chengdu/trajectories/client_{client_id}_train.pickle'
    with open(train_output_path, 'wb') as f:
        pickle.dump(splits['train'], f)
    # Save validation data
    valid_output_path = f'/home/haitao/data/CausalTAD/datasets/chengdu/trajectories/client_{client_id}_valid.pickle'
    with open(valid_output_path, 'wb') as f:
        pickle.dump(splits['valid'], f)
Key changes in this version:
- First groups all trajectories by their SD pairs
- For each client:
- Randomly selects 25% of all SD pairs
- For each selected SD pair:
- Takes all trajectories with that SD pair
- Randomly shuffles them
- Splits them 50/50 between train and valid sets
- Maintains dictionary structure with numbered keys for both train and valid sets
This ensures that:
- Each client gets 25% of all SD pairs
- For each SD pair a client has, the trajectories for that pair are split 50/50 between train and valid
- The same SD pair can appear in multiple clients (overlapping)
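As a quick sanity check of the last two properties, here is a minimal sketch (assuming the n_geo trajectory layout and the client_splits dict returned above):

def sd_pairs_of(split):
    """Recover the SD pairs present in one client's train/valid dicts."""
    pairs = set()
    for part in ('train', 'valid'):
        for traj in split[part].values():
            n_geo = traj['n_geo']
            pairs.add((tuple(n_geo[0]), tuple(n_geo[-1])))
    return pairs

pairs = {cid: sd_pairs_of(split) for cid, split in client_splits.items()}
for a in sorted(pairs):
    for b in sorted(pairs):
        if a < b:
            print(f"Clients {a} and {b} share {len(pairs[a] & pairs[b])} SD pairs")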
Is this more in line with what you were looking for?
Second question
In federated learning experiments, whether you test the aggregated model on the five clients' data depends on your experimental design and research goals. Here are the common setups:
1. Aggregated model on a global test set
- Goal: evaluate the overall performance of the aggregated global model across all client data.
- Procedure: merge the five clients' data into one global test set and evaluate the aggregated model on it.
- When to use: when you care about the model's overall generalization, especially if federated learning targets a global deployment scenario.
2. Aggregated model on each client
- Goal: evaluate how well the aggregated model adapts to each client's local data.
- Procedure: evaluate the aggregated model separately on each client's local test set.
- When to use: when analyzing the aggregated model under heterogeneous (Non-IID) data distributions, or when optimizing personalized adaptation.
3. Each client's local model on its own data
- Goal: compare the aggregated model against the clients' local models.
- Procedure: evaluate each client's local model and the aggregated model on the same local data.
- When to use: to validate the effectiveness of federated learning, i.e., whether the aggregated model outperforms the individual local models.
4. Cross-client testing
- Goal: measure how the aggregated model or a client's model performs on other clients' data (transferability).
- Procedure: evaluate the aggregated model on a single client's data, or evaluate one client's model on the other clients' data.
- When to use: to study transferability and behavior on out-of-distribution (OOD) data.
Summary
If your experimental goal is to:
- Validate global model performance: merge the five clients' data and test the aggregated model.
- Analyze model adaptability: test the aggregated model on each client's data separately.
- Study personalization or transferability: test local models or do cross-client testing.
The experimental design should match the concrete goal. If you share more background and objectives, I can help refine the design.
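As a rough sketch of setups 1 and 2, with a hypothetical evaluate_fn standing in for your own test routine:

def evaluate_global_model(global_model, test_loaders, evaluate_fn):
    """Setups 1 and 2: per-client scores plus a size-weighted global score.

    test_loaders: dict client_id -> (loader, num_samples); evaluate_fn runs
    the model on one loader and returns a per-sample mean metric.
    """
    per_client = {}
    sizes = {}
    for cid, (loader, n) in test_loaders.items():
        per_client[cid] = evaluate_fn(global_model, loader)  # setup 2
        sizes[cid] = n
    total = sum(sizes.values())
    # Setup 1: weighting by test-set size is equivalent to evaluating on the
    # merged global test set when the metric is a per-sample mean.
    global_score = sum(per_client[cid] * sizes[cid] / total for cid in per_client)
    return per_client, global_score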
How should this be adapted for federated learning?
I'll help you modify the code to implement federated learning with model averaging. Here's a step-by-step approach:
# federated_trainer.py
import os
import json
import torch
import copy
from torch.optim import Adam
from .trainer import Trainer

class FederatedTrainer:
    def __init__(self, num_clients, city, cuda_devices=[1]):
        self.num_clients = num_clients
        self.city = city
        self.cuda_devices = cuda_devices
        self.clients = []
        self.global_model = None
        # Initialize clients with different data partitions
        for i in range(num_clients):
            client = Trainer(
                save_model=f"client_{i}",
                city=city,
                cuda_devices=cuda_devices,
                load_model=None
            )
            self.clients.append(client)
        # Initialize the global model using the structure from the first client
        self.global_model = copy.deepcopy(self.clients[0].model)

    def average_models(self, models):
        """Average the weights of multiple models."""
        state_dict = {}
        # Iterate over the keys of the first model's state dict
        for key in models[0].state_dict().keys():
            # Skip batch norm statistics
            if 'running_mean' in key or 'running_var' in key:
                continue
            # Initialize sum tensor
            state_dict[key] = torch.zeros_like(models[0].state_dict()[key])
            # Sum up parameters from all models
            for model in models:
                state_dict[key] += model.state_dict()[key]
            # Average parameters
            state_dict[key] = torch.div(state_dict[key], len(models))
        return state_dict

    def distribute_model(self, state_dict):
        """Distribute the global model to all clients."""
        for client in self.clients:
            # strict=False in case BN statistics were skipped during averaging
            client.model.load_state_dict(state_dict, strict=False)

    def train_round(self, epoch):
        """Train one federated round."""
        # Train each client independently
        client_models = []
        for i, client in enumerate(self.clients):
            print(f"Training Client {i+1}/{len(self.clients)}")
            # Train the client for one epoch
            client.train_epoch(epoch, 1, client.train_dataset)
            # Collect the trained model
            client_models.append(copy.deepcopy(client.model))
        # Average model parameters
        global_state_dict = self.average_models(client_models)
        # Update the global model (strict=False: BN statistics may be missing)
        self.global_model.load_state_dict(global_state_dict, strict=False)
        # Distribute the updated model to all clients
        self.distribute_model(global_state_dict)

    def train(self, num_rounds):
        """Train for multiple federated rounds."""
        for round_num in range(num_rounds):
            print(f"\nFederated Round {round_num + 1}/{num_rounds}")
            self.train_round(round_num)
            # Save the global model periodically
            if round_num % 5 == 0:
                self.save_global_model(round_num)

    def save_global_model(self, round_num):
        """Save the global model."""
        save_path = os.path.join(self.clients[0].params.save_path,
                                 f"global_model_round_{round_num}.pth")
        torch.save({
            'model': self.global_model.state_dict(),
            'round': round_num
        }, save_path)

    def evaluate(self):
        """Evaluate the global model on all clients' test data."""
        print("\nEvaluating global model...")
        for i, client in enumerate(self.clients):
            print(f"\nClient {i+1} Evaluation:")
            # Load the global model into the client
            client.model.load_state_dict(self.global_model.state_dict())
            # Evaluate on the different test sets
            with torch.no_grad():
                client.test()
To use this federated learning implementation, follow these steps:
- Data Preparation:
- Partition your data across clients by creating separate data files for each client
- Update the Params class to handle client-specific data paths:
def __init__(self, city, client_id=None):
    # ... existing initialization ...
    if client_id is not None:
        self.train_dataset = f"/path/to/data/client_{client_id}_train.pickle"
        self.test_dataset = f"/path/to/data/client_{client_id}_test.pickle"
- Usage Example:
# Initialize federated training with 4 clients
fed_trainer = FederatedTrainer(num_clients=4, city="chengdu")
# Train for 50 federated rounds
fed_trainer.train(num_rounds=50)
# Evaluate final model
fed_trainer.evaluate()
- Key Features:
- Each client trains independently on their local data
- Model parameters are averaged after each round
- Global model is distributed back to all clients
- Regular checkpointing of the global model
- Evaluation across all clients' test data
- Considerations:
- The code handles device management and GPU distribution
- Batch normalization statistics are skipped during averaging
- Models are saved periodically for recovery
- Evaluation can be performed on different test scenarios (normal, detour, switch, ood)
- To Extend:
- Add weighted averaging based on client data sizes (see the sketch after this list)
- Implement secure aggregation
- Add client selection strategies
- Include communication compression
- Add privacy preservation mechanisms
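For the first of these extensions, here is a minimal sketch of FedAvg-style weighted averaging, assuming each client's local training-set size is available (the num_samples argument is mine, not part of the existing Trainer API):

import torch

def weighted_average_models(models, num_samples):
    """FedAvg-style weighted average: clients with more data contribute more.

    models: list of nn.Module instances with identical architectures
    num_samples: list of local training-set sizes, one per model
    """
    total = float(sum(num_samples))
    weights = [n / total for n in num_samples]
    state_dict = {}
    for key in models[0].state_dict().keys():
        # Skip BN statistics and batch counters, as in average_models above
        if 'running_mean' in key or 'running_var' in key or 'num_batches_tracked' in key:
            continue
        state_dict[key] = sum(
            w * m.state_dict()[key].float() for w, m in zip(weights, models)
        )
    return state_dict

Dropping this in place of the average_models call in train_round would weight each client's contribution by its data size.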
Would you like me to explain any specific part in more detail or help with implementing additional federated learning features?
if i have split the dataset
Ah, I see! Since you already have the data split into different client files (client_0 to client_4), let me modify the previous implementation to work with your existing data structure.
# federated_trainer.py
import os
import json
import torch
import copy
from torch.optim import Adam
from .trainer import Trainer
from .dataset import TrajectoryLoader  # assumed import path; the later version imports it from CausalTAD.dataset

class FederatedTrainer:
    def __init__(self, num_clients=5, city="chengdu", cuda_devices=[1]):
        """
        Initialize the federated learning trainer with pre-split datasets.

        Args:
            num_clients: number of clients (default 5, based on client_0 to client_4)
            city: city name for the dataset path
            cuda_devices: list of GPU devices to use
        """
        self.num_clients = num_clients
        self.city = city
        self.cuda_devices = cuda_devices
        self.clients = []
        self.global_model = None
        # Initialize clients using the existing data splits
        for i in range(num_clients):
            # Modify the trainer to use client-specific data paths
            client = Trainer(
                save_model=f"client_{i}",
                city=city,
                cuda_devices=cuda_devices,
                load_model=None
            )
            # Update dataset paths to use client-specific files
            client.params.train_dataset = f"/home/haitao/data/CausalTAD/datasets/{city}/trajectories/client_{i}_train.pickle"
            client.params.normal_dataset = f"/home/haitao/data/CausalTAD/datasets/{city}/trajectories/client_{i}_test.pickle"
            # Initialize the client's dataloader with its specific data
            client.train_dataset = client.train_loader = TrajectoryLoader(
                client.params.train_dataset,
                client.road_network.node2id,
                client.params.batch_size,
                client.params.label_num
            )
            self.clients.append(client)
        # Initialize the global model using the structure from the first client
        self.global_model = copy.deepcopy(self.clients[0].model)

    def average_models(self, models):
        """Average the weights of multiple models."""
        state_dict = {}
        for key in models[0].state_dict().keys():
            # Skip batch norm statistics
            if 'running_mean' in key or 'running_var' in key:
                continue
            # Initialize sum tensor
            state_dict[key] = torch.zeros_like(models[0].state_dict()[key])
            # Sum up parameters from all models
            for model in models:
                state_dict[key] += model.state_dict()[key]
            # Average parameters
            state_dict[key] = torch.div(state_dict[key], len(models))
        return state_dict

    def distribute_model(self, state_dict):
        """Distribute the global model to all clients."""
        for client in self.clients:
            # strict=False in case BN statistics were skipped during averaging
            client.model.load_state_dict(state_dict, strict=False)

    def train_round(self, epoch):
        """Train one federated round."""
        # Train each client independently
        client_models = []
        for i, client in enumerate(self.clients):
            print(f"Training Client {i}/{self.num_clients-1}")
            # Train the client for one epoch
            client.train_epoch(epoch, 1, client.train_dataset)
            # Collect the trained model
            client_models.append(copy.deepcopy(client.model))
        # Average model parameters
        global_state_dict = self.average_models(client_models)
        # Update the global model (strict=False: BN statistics may be missing)
        self.global_model.load_state_dict(global_state_dict, strict=False)
        # Distribute the updated model to all clients
        self.distribute_model(global_state_dict)

    def train(self, num_rounds):
        """Train for multiple federated rounds."""
        for round_num in range(num_rounds):
            print(f"\nFederated Round {round_num + 1}/{num_rounds}")
            self.train_round(round_num)
            # Save the global model periodically
            if round_num % 5 == 0:
                self.save_global_model(round_num)

    def save_global_model(self, round_num):
        """Save the global model."""
        save_path = os.path.join(self.clients[0].params.save_path,
                                 f"global_model_round_{round_num}.pth")
        torch.save({
            'model': self.global_model.state_dict(),
            'round': round_num
        }, save_path)

    def evaluate(self):
        """Evaluate the global model on all test scenarios."""
        print("\nEvaluating global model...")
        # Test on each client's test data
        for i, client in enumerate(self.clients):
            print(f"\nEvaluating on Client {i} test data:")
            client.model.load_state_dict(self.global_model.state_dict())
            with torch.no_grad():
                # Test on normal data (client-specific test set)
                client.train_epoch(0, 3, TrajectoryLoader(
                    client.params.normal_dataset,
                    client.road_network.node2id,
                    client.params.batch_size,
                    client.params.label_num
                ))
        # Test the anomaly scenarios using client 0's model
        test_client = self.clients[0]
        test_client.model.load_state_dict(self.global_model.state_dict())
        with torch.no_grad():
            # Test on detour data
            print("\nEvaluating on detour data:")
            test_client.train_epoch(1, 3, TrajectoryLoader(
                test_client.params.detour_dataset,
                test_client.road_network.node2id,
                test_client.params.batch_size,
                test_client.params.label_num
            ))
            # Test on switch data
            print("\nEvaluating on switch data:")
            test_client.train_epoch(2, 3, TrajectoryLoader(
                test_client.params.switch_dataset,
                test_client.road_network.node2id,
                test_client.params.batch_size,
                test_client.params.label_num
            ))
            # Test on OOD data
            print("\nEvaluating on OOD data:")
            test_client.train_epoch(3, 3, TrajectoryLoader(
                test_client.params.ood_dataset,
                test_client.road_network.node2id,
                test_client.params.batch_size,
                test_client.params.label_num
            ))
To use this implementation with your pre-split datasets:
- Run Training:
# Initialize federated training with 5 clients (client_0 to client_4)
fed_trainer = FederatedTrainer(num_clients=5, city="chengdu")
# Train for desired number of rounds
fed_trainer.train(num_rounds=50)
# Evaluate final model
fed_trainer.evaluate()
Key changes from the previous version:
- Uses your existing client_[0-4]_train.pickle and client_[0-4]_test.pickle files
- Maintains the same directory structure as your original code
- Keeps all the anomaly detection test scenarios (detour, switch, OOD)
- No need for manual data partitioning since data is already split
The implementation:
- Trains each client on their specific data partition
- Aggregates models using averaging after each round
- Evaluates on both client-specific test data and anomaly scenarios
- Maintains the original model architecture and training process
- Preserves all test scenarios (normal, detour, switch, OOD)
Would you like me to explain any specific part in more detail or help with additional modifications?
detailed setting
Let me address your 3 questions:
- Federated Setting:
- Number of clients: 5 (client_0 to client_4)
- Communication rounds: Every client trains locally for one epoch, then aggregates
- Data distribution: Non-IID since each client has their own trajectory data
- Synchronous federated learning: All clients participate in each round
- Client to server architecture: Centralized aggregation
- Model Aggregation Details:
In our model (the Model class), we need to aggregate these components:
- road_embedding: node embeddings
- confidence: confidence estimation module
- vae: encoder-decoder model (encoder, decoder, and SD decoder parameters)
- projection_head: projection matrices
- gnn: graph neural network parameters
The current aggregation simply averages all parameters except batch normalization statistics. We should consider:
- Weighted averaging based on client data sizes
- Separate aggregation strategies for different components, e.g. embeddings vs projection matrices (see the sketch after this list)
- Potential for layer-wise aggregation
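A minimal sketch of the second idea, component-wise aggregation; the parameter-name prefixes follow the component list above, and the rule assignment is purely illustrative:

import torch

def aggregate_by_component(models, rules):
    """Apply a different aggregation function per parameter-name prefix.

    rules: list of (prefix, agg_fn) pairs; agg_fn takes a list of tensors
    and returns one tensor. Keys matching no prefix fall back to a mean.
    """
    def mean(tensors):
        return torch.stack([t.float() for t in tensors]).mean(dim=0)

    state_dict = {}
    for key in models[0].state_dict().keys():
        tensors = [m.state_dict()[key] for m in models]
        agg = next((fn for prefix, fn in rules if key.startswith(prefix)), mean)
        state_dict[key] = agg(tensors)
    return state_dict

# Example: keep client 0's own road_embedding, average everything else:
# rules = [('road_embedding', lambda tensors: tensors[0])]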
- Evaluation Strategy:
Current approach:
# Using the global model to evaluate on each client's test data
for i, client in enumerate(self.clients):
    client.model.load_state_dict(self.global_model.state_dict())
    client.test()  # tests on client_i_test.pickle
We should modify this to:
a) Per-client evaluation:
- Test global model on each client's local test set
- Compare performance across clients to check for fairness
- Identify any performance disparities between clients
b) Global evaluation:
- Test on anomaly scenarios (detour, switch, OOD)
- Evaluate overall model generalization
- Compare with single-client training results
Here's my suggested improved evaluation approach:
import numpy as np  # needed for the aggregate metrics below

class FederatedTrainer:
    def evaluate_comprehensive(self):
        """Comprehensive evaluation of the global model."""
        results = {
            'client_specific': {},
            'global_metrics': {},
            'anomaly_detection': {}
        }
        # 1. Per-client evaluation
        for i, client in enumerate(self.clients):
            print(f"\nEvaluating Client {i} Performance:")
            client.model.load_state_dict(self.global_model.state_dict())
            with torch.no_grad():
                # Test on the client's specific test set
                test_loader = TrajectoryLoader(
                    client.params.normal_dataset,
                    client.road_network.node2id,
                    client.params.batch_size,
                    client.params.label_num
                )
                metrics = self.evaluate_client(client, test_loader)
                results['client_specific'][f'client_{i}'] = metrics
        # 2. Global anomaly detection evaluation
        test_client = self.clients[0]  # Use the first client for anomaly testing
        test_client.model.load_state_dict(self.global_model.state_dict())
        with torch.no_grad():
            # Test on each anomaly scenario
            for scenario in ['detour', 'switch', 'ood']:
                dataset_path = getattr(test_client.params, f'{scenario}_dataset')
                test_loader = TrajectoryLoader(
                    dataset_path,
                    test_client.road_network.node2id,
                    test_client.params.batch_size,
                    test_client.params.label_num
                )
                metrics = self.evaluate_client(test_client, test_loader)
                results['anomaly_detection'][scenario] = metrics
        # 3. Calculate global metrics
        results['global_metrics'] = self.calculate_global_metrics(results)
        # Save results
        self.save_evaluation_results(results)
        return results

    def evaluate_client(self, client, loader):
        """Evaluate a single client's performance."""
        nll_losses = []
        kl_losses = []
        confidences = []
        # enumerate so the batch index i used below is actually defined
        for i, data in enumerate(loader.src_data_batchs):
            # Forward pass
            nll_loss, kl_loss, confidence, _ = client.model.forward(
                data.to(client.device),
                loader.trg_data_batchs[i].to(client.device),
                client.road_network.sample_subgraph(data),
                loader.src_length_batchs[i],
                loader.trg_length_batchs[i]
            )
            nll_losses.append(nll_loss.mean().item())
            kl_losses.append(kl_loss.mean().item())
            confidences.append(confidence.mean().item())
        return {
            'nll_loss': np.mean(nll_losses),
            'kl_loss': np.mean(kl_losses),
            'confidence': np.mean(confidences)
        }

    def calculate_global_metrics(self, results):
        """Calculate aggregate metrics across all clients."""
        client_metrics = results['client_specific']
        global_metrics = {
            'avg_nll_loss': np.mean([m['nll_loss'] for m in client_metrics.values()]),
            'avg_kl_loss': np.mean([m['kl_loss'] for m in client_metrics.values()]),
            'avg_confidence': np.mean([m['confidence'] for m in client_metrics.values()]),
            'std_nll_loss': np.std([m['nll_loss'] for m in client_metrics.values()]),
            'fairness_score': self.calculate_fairness_score(client_metrics)
        }
        return global_metrics

    def save_evaluation_results(self, results):
        """Save evaluation results to a file."""
        output_path = os.path.join(self.clients[0].params.output, "federated_evaluation.json")
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2)
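The code above calls calculate_fairness_score, which is not defined anywhere in this conversation. A minimal sketch of one reasonable choice (an assumption on my part, not something fixed by the code): the coefficient of variation of per-client NLL loss, where values near zero mean uniform performance across clients:

import numpy as np

# Hypothetical helper to add to FederatedTrainer; the metric is illustrative.
def calculate_fairness_score(self, client_metrics):
    """Coefficient of variation of per-client NLL loss (0 = perfectly uniform)."""
    losses = np.array([m['nll_loss'] for m in client_metrics.values()])
    mean = losses.mean()
    return float(losses.std() / mean) if mean != 0 else 0.0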
This improved evaluation:
- Evaluates the global model's performance on each client's local test data
- Maintains the original anomaly detection capabilities
- Adds global metrics to assess overall model quality
- Includes fairness measures across clients
- Saves comprehensive results for analysis
Would you like me to explain any specific aspect in more detail?
detailed setting
Let me address both aspects:
- Federated Rounds Setting:
For trajectory anomaly detection, I recommend:
- Total rounds: 50-100 rounds
- Each round consists of:
- Local training: 1 epoch per client
- Model aggregation
- Global model distribution
The reason for this setting is:
- Trajectory data typically needs sufficient rounds to learn spatial patterns
- Too few rounds (<50) might not capture complex trajectory patterns
- Too many rounds (>100) might lead to overfitting
- Anomaly Detection Scoring:
Yes, you're correct! In the inference phase, each trajectory gets an anomaly score that combines:
- VAE reconstruction loss (nll_loss)
- KL divergence (kl_loss)
- Confidence score
Let me show you how to properly implement this scoring:
class FederatedTrainer:
    def calculate_anomaly_scores(self, client, loader):
        """Calculate anomaly scores for each trajectory."""
        trajectory_scores = []
        client.model.eval()
        with torch.no_grad():
            for i, src_data in enumerate(loader.src_data_batchs):
                # Get model outputs
                nll_loss, kl_loss, confidence, _ = client.model.forward(
                    src_data.to(client.device),
                    loader.trg_data_batchs[i].to(client.device),
                    client.road_network.sample_subgraph(src_data),
                    loader.src_length_batchs[i],
                    loader.trg_length_batchs[i]
                )
                # Calculate a score for each trajectory in the batch
                for j in range(src_data.size(0)):
                    # Get individual trajectory metrics
                    traj_nll = nll_loss[j].mean().item()
                    traj_kl = kl_loss[j].item()
                    traj_conf = confidence[j].mean().item()
                    # Combine scores (you can adjust these weights)
                    anomaly_score = (
                        0.4 * traj_nll +   # Reconstruction error weight
                        0.3 * traj_kl +    # KL divergence weight
                        0.3 * traj_conf    # Confidence score weight
                    )
                    trajectory_scores.append({
                        'trajectory_idx': i * loader.batch_size + j,
                        'anomaly_score': anomaly_score,
                        'components': {
                            'nll_loss': traj_nll,
                            'kl_loss': traj_kl,
                            'confidence': traj_conf
                        }
                    })
        return trajectory_scores

    def evaluate_anomalies(self):
        """Evaluate anomalies on all test scenarios."""
        test_client = self.clients[0]
        test_client.model.load_state_dict(self.global_model.state_dict())
        results = {}
        scenarios = {
            'normal': test_client.params.normal_dataset,
            'detour': test_client.params.detour_dataset,
            'switch': test_client.params.switch_dataset,
            'ood': test_client.params.ood_dataset
        }
        for scenario_name, dataset_path in scenarios.items():
            print(f"\nEvaluating {scenario_name} trajectories...")
            # Create the data loader
            loader = TrajectoryLoader(
                dataset_path,
                test_client.road_network.node2id,
                test_client.params.batch_size,
                test_client.params.label_num
            )
            # Calculate scores
            scores = self.calculate_anomaly_scores(test_client, loader)
            # Calculate statistics
            anomaly_scores = [s['anomaly_score'] for s in scores]
            results[scenario_name] = {
                'mean_score': np.mean(anomaly_scores),
                'std_score': np.std(anomaly_scores),
                'min_score': np.min(anomaly_scores),
                'max_score': np.max(anomaly_scores),
                'detailed_scores': scores
            }
            # Save individual trajectory scores
            output_path = os.path.join(
                test_client.params.output,
                f"anomaly_scores_{scenario_name}.json"
            )
            with open(output_path, 'w') as f:
                json.dump(scores, f, indent=2)
        return results
Key points about the anomaly detection:
- Score Components:
- nll_loss: measures how well the VAE reconstructs the trajectory
- kl_loss: measures the divergence between the learned distribution and the prior
- confidence: direct confidence estimation from the model
- Scoring Process:
- Each trajectory is processed individually
- All three components are combined with weights
- Higher scores indicate more anomalous trajectories
- Scores are saved for each scenario (normal, detour, switch, OOD)
- Usage:
# After federated training
fed_trainer = FederatedTrainer(num_clients=5, city="chengdu")
fed_trainer.train(num_rounds=50) # Train for 50 rounds
# Evaluate anomalies
anomaly_results = fed_trainer.evaluate_anomalies()
# Results will show score distributions for each scenario
# Can be used to set threshold for anomaly detection
- Analysis:
- Compare score distributions between normal and anomalous trajectories
- Use score components to understand type of anomaly
- Set threshold based on normal trajectory score distribution
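For the last point, a hedged sketch of picking a threshold from the normal-score distribution (the 95th percentile is an illustrative choice, not a value from the code above; results is the dict returned by evaluate_anomalies):

import numpy as np

normal_scores = [s['anomaly_score'] for s in results['normal']['detailed_scores']]
threshold = np.percentile(normal_scores, 95)  # flag the top 5% of normal scores

for scenario in ('detour', 'switch', 'ood'):
    scores = [s['anomaly_score'] for s in results[scenario]['detailed_scores']]
    flagged = sum(score > threshold for score in scores)
    print(f"{scenario}: {flagged}/{len(scores)} trajectories above threshold")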
Would you like me to explain any specific part of the scoring mechanism or adjust the weights used in combining the scores?
Latest revision
I'll help you modify the code to use 3 GPUs, optimize model saving, and add loss tracking and visualization. Here's the complete revised version:
# federated_trainer.py
import os
import json
import torch
import copy
import time
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from torch.optim import Adam
from CausalTAD import Trainer as CausalTAD
from CausalTAD.dataset import TrajectoryLoader, GraphLoader, GraphPartition
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

def format_minutes(seconds):
    minutes = seconds / 60
    return f"{minutes:.2f} minutes"

class FederatedTrainer:
    def __init__(self, num_clients=5, city="chengdu", cuda_devices=[0, 1, 2]):
        self.num_clients = num_clients
        self.city = city
        self.cuda_devices = cuda_devices
        self.clients = []
        self.global_model = None
        # Loss tracking
        self.client_losses = {i: [] for i in range(num_clients)}
        self.global_losses = []
        # Initialize clients using the existing data splits
        for i in range(num_clients):
            # Assign each client to a GPU in round-robin fashion
            gpu_id = i % len(cuda_devices)
            device = f'cuda:{cuda_devices[gpu_id]}'
            client = CausalTAD(
                save_model=f"client_{i}",
                city=city,
                cuda_devices=[cuda_devices[gpu_id]],
                load_model=None
            )
            client.device = device
            # Update dataset paths for each client
            client.params.train_dataset = f"/home/haitao/data/CausalTAD/datasets/{city}/trajectories/client_{i}_train.pickle"
            client.params.normal_dataset = f"/home/haitao/data/CausalTAD/datasets/{city}/trajectories/client_{i}_test.pickle"
            # Move the model to its assigned GPU
            client.model = client.model.to(device)
            # Initialize the client's dataloader
            client.train_dataset = TrajectoryLoader(
                client.params.train_dataset,
                client.road_network.node2id,
                client.params.batch_size,
                client.params.label_num
            )
            self.clients.append(client)
        # Initialize the global model on the first GPU
        self.global_model = copy.deepcopy(self.clients[0].model).to(f'cuda:{cuda_devices[0]}')

    def average_models(self, models):
        """Average the weights of multiple models."""
        state_dict = {}
        for key in models[0].state_dict().keys():
            if 'running_mean' in key or 'running_var' in key:
                continue
            state_dict[key] = torch.zeros_like(models[0].state_dict()[key])
            for model in models:
                # Clients live on different GPUs, so move each tensor to the
                # accumulator's device before summing
                state_dict[key] += model.state_dict()[key].to(state_dict[key].device)
            state_dict[key] = torch.div(state_dict[key], len(models))
        return state_dict

    def train_round(self, epoch):
        """Train one federated round."""
        client_models = []
        round_losses = []
        for i, client in enumerate(self.clients):
            print(f"Training Client {i}/{self.num_clients-1} on {client.device}")
            # Assumes train_epoch returns the average loss for the epoch
            avg_loss = client.train_epoch(epoch, 1, client.train_dataset)
            self.client_losses[i].append(avg_loss)
            round_losses.append(avg_loss)
            client_models.append(copy.deepcopy(client.model))
        # Calculate and store the global (mean) loss
        global_loss = np.mean(round_losses)
        self.global_losses.append(global_loss)
        # Average the models and update (strict=False: BN statistics were skipped)
        global_state_dict = self.average_models(client_models)
        self.global_model.load_state_dict(global_state_dict, strict=False)
        # Distribute to clients
        for client in self.clients:
            client.model.load_state_dict(global_state_dict, strict=False)

    def train(self, num_rounds):
        """Train for multiple federated rounds."""
        for round_num in range(num_rounds):
            print(f"\nFederated Round {round_num + 1}/{num_rounds}")
            self.train_round(round_num)
            # Save only the final model
            if round_num == num_rounds - 1:
                self.save_global_model(round_num)

    def plot_losses(self):
        """Plot client and global losses."""
        plt.figure(figsize=(12, 6))
        # Plot client losses
        for client_id, losses in self.client_losses.items():
            plt.plot(losses, label=f'Client {client_id}', alpha=0.5)
        # Plot the global loss
        plt.plot(self.global_losses, label='Global', linewidth=2, color='black')
        plt.title('Training Loss Over Rounds')
        plt.xlabel('Round')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        # Save the plot
        plt.savefig('/home/haitao/data/CausalTAD/datasets/output/training_losses.png')
        plt.close()
        # Save the loss data
        loss_data = {
            'client_losses': self.client_losses,
            'global_losses': self.global_losses
        }
        with open('/home/haitao/data/CausalTAD/datasets/output/loss_history.json', 'w') as f:
            json.dump(loss_data, f)

    # [Previous save_global_model, test, and evaluation methods remain the same]

if __name__ == "__main__":
    # Enable cuDNN autotuning for fixed-size inputs
    torch.backends.cudnn.benchmark = True
    # Record the start time
    start_time = time.time()
    print(f"Script started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Training
    print("Starting federated training...")
    fed_trainer = FederatedTrainer(num_clients=5, city="chengdu", cuda_devices=[0, 1, 2])
    fed_trainer.train(num_rounds=50)
    # Plot the training losses
    fed_trainer.plot_losses()
    train_time = time.time()
    training_duration = train_time - start_time
    print(f"Training completed in: {format_minutes(training_duration)}")
    # Inference
    print("Starting inference...")
    fed_trainer.test()
    # Evaluation (evaluate_federated is defined elsewhere in this file)
    evaluate_federated()
    # Record the end time and calculate the total duration
    end_time = time.time()
    inference_duration = end_time - train_time
    total_duration = end_time - start_time
    print("\nExecution Summary:")
    print(f"Training time: {format_minutes(training_duration)}")
    print(f"Inference time: {format_minutes(inference_duration)}")
    print(f"Total execution time: {format_minutes(total_duration)}")
    print(f"Script completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
Key changes made:
- Multi-GPU Support:
- Added round-robin GPU assignment for clients
- Each client stays on its assigned GPU
- Global model lives on the first GPU
- Model Saving:
- Only saves the final model instead of every 5 rounds
- Reduces storage requirements
- Loss Tracking:
- Added client_losses and global_losses tracking
- Plots loss curves for visualization
- Saves loss history to a JSON file
- Visualization:
- Creates a training loss plot
- Shows individual client and global losses
- Saves the plot as a PNG file
The test() and evaluate_federated() functions remain unchanged as they were working well.
Would you like me to explain any specific part in more detail?