Checkpointing & Recovery

This guide shows how to save and restore evolution state for long-running optimizations.

Why Checkpointing?

  • Resume interrupted runs: Continue after crashes or shutdowns
  • Experiment branching: Try different strategies from the same point
  • Progress monitoring: Analyze intermediate states
  • Resource limits: Work within time-limited environments

Basic Checkpointing

Creating Checkpoints

use fugue_evo::prelude::*;
use std::path::PathBuf;

// Create checkpoint manager
// Files are written under `checkpoint_dir` using the name "my_evolution"
// (see "Custom Naming" below for the resulting file names).
let checkpoint_dir = PathBuf::from("./checkpoints");
let mut manager = CheckpointManager::new(&checkpoint_dir, "my_evolution")
    .every(50)    // Save every 50 generations
    .keep(3);     // Keep last 3 checkpoints

// In evolution loop
// `gen` is zero-based, so `gen + 1` is the number of generations completed
// once the evolution step for this iteration has run.
for gen in 0..max_generations {
    // ... evolution step ...

    if manager.should_save(gen + 1) {
        // Snapshot the current individuals by cloning them out of the population.
        let individuals: Vec<Individual<RealVector>> = population.iter().cloned().collect();
        // The evaluation count stored here is an estimate:
        // completed generations x population size.
        let checkpoint = Checkpoint::new(gen + 1, individuals)
            .with_evaluations((gen + 1) * population_size);

        // `?` propagates a failed save to the caller.
        manager.save(&checkpoint)?;
        println!("Saved checkpoint at generation {}", gen + 1);
    }
}

Loading Checkpoints

use fugue_evo::checkpoint::load_checkpoint;

// Load specific checkpoint
// The genome type must be spelled out (here `RealVector`) so the loader
// knows what to deserialize into; `?` propagates a load failure.
let checkpoint: Checkpoint<RealVector> = load_checkpoint("./checkpoints/my_evolution_gen_100.ckpt")?;

println!("Loaded generation: {}", checkpoint.generation);
println!("Population size: {}", checkpoint.population.len());
println!("Evaluations: {}", checkpoint.evaluations);

// Reconstruct population
// The loop below moves `checkpoint.population` out of the checkpoint, so any
// reads of that field (like the prints above) have to happen first.
let mut population: Population<RealVector, f64> =
    Population::with_capacity(checkpoint.population.len());
for ind in checkpoint.population {
    population.push(ind);
}

Complete Example

//! Checkpointing and Recovery
//!
//! This example demonstrates how to save and restore evolution state
//! using checkpoints. This is essential for long-running optimizations
//! that may need to be interrupted and resumed.

use fugue_evo::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("=== Checkpointing and Recovery ===\n");

    let checkpoint_dir = PathBuf::from("/tmp/fugue_evo_checkpoints");

    // Clean up any existing checkpoints first
    if checkpoint_dir.exists() {
        std::fs::remove_dir_all(&checkpoint_dir)?;
    }

    // Run with checkpoints
    run_with_checkpoints(&checkpoint_dir)?;

    // Demonstrate resuming (in real usage, this would be after a restart)
    println!("\n--- Simulating resume from checkpoint ---\n");
    resume_from_checkpoint(&checkpoint_dir)?;

    // Clean up
    if checkpoint_dir.exists() {
        std::fs::remove_dir_all(&checkpoint_dir)?;
        println!("\nCheckpoint directory cleaned up.");
    }

    Ok(())
}

/// Evolves a population of 100 on the 10-dimensional `Sphere` fitness for 200
/// generations, saving a checkpoint through a `CheckpointManager` whenever
/// the manager says one is due.
///
/// The RNG is seeded with a fixed value (42), and the final best fitness is
/// printed once the loop finishes.
///
/// # Errors
///
/// Returns the error from `manager.save` if a checkpoint cannot be written.
///
/// # Panics
///
/// Panics if `population.best()` returns `None` when the best individual is
/// reported.
fn run_with_checkpoints(checkpoint_dir: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
    let mut rng = StdRng::seed_from_u64(42);

    const DIM: usize = 10;
    let fitness = Sphere::new(DIM);
    let bounds = MultiBounds::symmetric(5.12, DIM);

    // Run evolution with periodic checkpoints
    let mut population: Population<RealVector, f64> = Population::random(100, &bounds, &mut rng);
    population.evaluate(&fitness);

    let selection = TournamentSelection::new(3);
    let crossover = SbxCrossover::new(20.0);
    let mutation = PolynomialMutation::new(20.0);

    // Create checkpoint manager
    let mut manager = CheckpointManager::new(checkpoint_dir, "evolution")
        .every(50) // Save every 50 generations
        .keep(3); // Keep last 3 checkpoints

    let max_generations = 200;

    for gen in 0..max_generations {
        // Evolution step
        let selection_pool: Vec<_> = population.as_fitness_pairs();
        let mut new_pop: Population<RealVector, f64> = Population::with_capacity(100);

        // Elitism
        // The current best individual is copied unchanged into the next generation.
        if let Some(best) = population.best() {
            new_pop.push(best.clone());
        }

        // Fill the remaining slots with offspring, two children per iteration.
        while new_pop.len() < 100 {
            let p1_idx = selection.select(&selection_pool, &mut rng);
            let p2_idx = selection.select(&selection_pool, &mut rng);

            // If crossover yields no genome pair, fall back to clones of the
            // two selected parents.
            let (mut c1, mut c2) = crossover
                .crossover(
                    &selection_pool[p1_idx].0,
                    &selection_pool[p2_idx].0,
                    &mut rng,
                )
                .genome()
                .unwrap_or_else(|| {
                    (
                        selection_pool[p1_idx].0.clone(),
                        selection_pool[p2_idx].0.clone(),
                    )
                });

            mutation.mutate(&mut c1, &mut rng);
            mutation.mutate(&mut c2, &mut rng);

            new_pop.push(Individual::new(c1));
            // The second child is dropped when the first one already filled
            // the population.
            if new_pop.len() < 100 {
                new_pop.push(Individual::new(c2));
            }
        }

        new_pop.evaluate(&fitness);
        new_pop.set_generation(gen + 1);
        population = new_pop;

        // Save checkpoint periodically
        // `gen` is zero-based; `gen + 1` is the number of completed generations.
        if manager.should_save(gen + 1) {
            let best = population.best().unwrap();
            println!(
                "Gen {:3}: Best = {:.6} - Saving checkpoint...",
                gen + 1,
                best.fitness_value()
            );

            // Create checkpoint with current population
            // NOTE(review): the evaluation count is generations x 100 and does
            // not include the initial `population.evaluate` call above —
            // confirm whether that is intended.
            let individuals: Vec<Individual<RealVector>> = population.iter().cloned().collect();
            let checkpoint =
                Checkpoint::new(gen + 1, individuals).with_evaluations((gen + 1) * 100);

            manager.save(&checkpoint)?;
        }
    }

    let best = population.best().unwrap();
    println!("\nFinal result:");
    println!("  Best fitness: {:.6}", best.fitness_value());
    println!("  Generations:  {}", max_generations);

    Ok(())
}

/// Loads the newest `.ckpt` file in `checkpoint_dir` and continues the
/// evolution from its generation up to generation 200.
///
/// "Newest" means the highest generation number parsed from the end of the
/// file stem (`<name>_gen_<N>.ckpt`). Comparing paths as strings would rank
/// `..._gen_50.ckpt` after `..._gen_100.ckpt`, so the number is compared
/// numerically; files without a trailing number rank lowest and fall back to
/// path order among themselves.
///
/// If the directory holds no `.ckpt` files, a message is printed and the
/// function returns `Ok(())` without doing anything else.
///
/// # Errors
///
/// Returns an error if the directory cannot be read or the checkpoint cannot
/// be loaded.
///
/// # Panics
///
/// Panics if `population.best()` returns `None` when progress is reported.
fn resume_from_checkpoint(checkpoint_dir: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
    // Find the latest checkpoint file by generation number, not by name order.
    let latest_checkpoint = std::fs::read_dir(checkpoint_dir)?
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().is_some_and(|ext| ext == "ckpt"))
        .max_by_key(|p| {
            let generation = p
                .file_stem()
                .and_then(|stem| stem.to_str())
                .and_then(|stem| stem.rsplit('_').next())
                .and_then(|digits| digits.parse::<u64>().ok());
            // `None` sorts below every `Some(_)`; the path breaks ties.
            (generation, p.clone())
        });

    let latest_checkpoint = match latest_checkpoint {
        Some(path) => path,
        None => {
            println!("No checkpoint files found!");
            return Ok(());
        }
    };

    println!("Loading checkpoint: {:?}", latest_checkpoint);

    // Load checkpoint
    let checkpoint: Checkpoint<RealVector> = load_checkpoint(&latest_checkpoint)?;

    println!("Loaded checkpoint:");
    println!("  Generation: {}", checkpoint.generation);
    println!("  Population size: {}", checkpoint.population.len());
    println!("  Evaluations: {}", checkpoint.evaluations);

    // Find best in loaded population. Incomparable values (NaN) are treated
    // as equal instead of panicking, so a bad fitness cannot abort the resume.
    let best_individual = checkpoint
        .population
        .iter()
        .filter_map(|ind| ind.fitness.as_ref().map(|f| (ind, f.to_f64())))
        .max_by(|(_, f1), (_, f2)| f1.partial_cmp(f2).unwrap_or(std::cmp::Ordering::Equal));

    if let Some((_best, fitness)) = best_individual {
        println!("  Best fitness at checkpoint: {:.6}", fitness);
    }

    // Continue evolution...
    let mut rng = StdRng::seed_from_u64(12345); // Different seed for continuation
    let fitness = Sphere::new(10);

    // Reconstruct population from checkpoint
    let mut population: Population<RealVector, f64> =
        Population::with_capacity(checkpoint.population.len());
    for ind in checkpoint.population {
        population.push(ind);
    }

    let selection = TournamentSelection::new(3);
    let crossover = SbxCrossover::new(20.0);
    let mutation = PolynomialMutation::new(20.0);

    // Saturating subtraction: a checkpoint taken past the target generation
    // must not underflow the unsigned counter.
    let max_generations: usize = 200;
    let remaining_gens = max_generations.saturating_sub(checkpoint.generation);
    println!("\nContinuing for {} more generations...\n", remaining_gens);

    for gen in checkpoint.generation..max_generations {
        let selection_pool: Vec<_> = population.as_fitness_pairs();
        let mut new_pop: Population<RealVector, f64> = Population::with_capacity(100);

        // Elitism: carry the current best individual over unchanged.
        if let Some(best) = population.best() {
            new_pop.push(best.clone());
        }

        while new_pop.len() < 100 {
            let p1_idx = selection.select(&selection_pool, &mut rng);
            let p2_idx = selection.select(&selection_pool, &mut rng);

            // Fall back to clones of the parents when crossover yields no genomes.
            let (mut c1, mut c2) = crossover
                .crossover(
                    &selection_pool[p1_idx].0,
                    &selection_pool[p2_idx].0,
                    &mut rng,
                )
                .genome()
                .unwrap_or_else(|| {
                    (
                        selection_pool[p1_idx].0.clone(),
                        selection_pool[p2_idx].0.clone(),
                    )
                });

            mutation.mutate(&mut c1, &mut rng);
            mutation.mutate(&mut c2, &mut rng);

            new_pop.push(Individual::new(c1));
            if new_pop.len() < 100 {
                new_pop.push(Individual::new(c2));
            }
        }

        new_pop.evaluate(&fitness);
        new_pop.set_generation(gen + 1);
        population = new_pop;

        if (gen + 1) % 50 == 0 {
            let best = population.best().unwrap();
            println!("Gen {:3}: Best = {:.6}", gen + 1, best.fitness_value());
        }
    }

    let best = population.best().unwrap();
    println!("\nFinal result after resumption:");
    println!("  Best fitness: {:.6}", best.fitness_value());

    Ok(())
}

Source: examples/checkpointing.rs

Running the Example

cargo run --example checkpointing

Checkpoint Manager Options

Save Frequency

// Every N generations
CheckpointManager::new(&dir, "name").every(50);

// Only at specific generations
CheckpointManager::new(&dir, "name").at_generations(&[100, 200, 500]);

Retention Policy

// Keep last N checkpoints
CheckpointManager::new(&dir, "name").keep(3);

// Keep all checkpoints
CheckpointManager::new(&dir, "name").keep_all();

// Custom retention
CheckpointManager::new(&dir, "name").keep_every(100); // Keep every 100th

Custom Naming

// Default: name_gen_N.ckpt
let manager = CheckpointManager::new(&dir, "experiment_1");
// Creates: experiment_1_gen_50.ckpt, experiment_1_gen_100.ckpt, etc.

Checkpoint Contents

The Checkpoint struct stores:

/// Saved state of an evolution run, generic over the genome type `G`.
pub struct Checkpoint<G: EvolutionaryGenome> {
    /// Current generation number
    pub generation: usize,

    /// Full population with fitness values
    pub population: Vec<Individual<G>>,

    /// Total fitness evaluations so far
    pub evaluations: usize,

    /// Optional metadata
    pub metadata: Option<CheckpointMetadata>,
}

Adding Metadata

let checkpoint = Checkpoint::new(gen, individuals)
    .with_evaluations(evaluations)
    .with_metadata(CheckpointMetadata {
        timestamp: chrono::Utc::now(),
        best_fitness: population.best().map(|b| *b.fitness_value()),
        config: serde_json::to_string(&config).ok(),
    });

Resume Strategy

Find Latest Checkpoint

/// Returns the newest checkpoint file in `dir` whose name starts with
/// `prefix` and ends with `.ckpt`.
///
/// Checkpoints are named `<prefix>_gen_<N>.ckpt`, so "newest" means the
/// largest `N` compared as a number. Comparing whole paths as strings would
/// rank `..._gen_50.ckpt` above `..._gen_100.ckpt` and resume from the wrong
/// file. Matching files without a parsable `_gen_<N>` part rank below every
/// numbered one and are ordered by path among themselves.
///
/// Returns `None` when `dir` cannot be read or contains no matching file.
fn find_latest_checkpoint(dir: &Path, prefix: &str) -> Option<PathBuf> {
    std::fs::read_dir(dir)
        .ok()?
        .filter_map(|entry| entry.ok())
        .filter_map(|entry| {
            let path = entry.path();
            let name = path.file_name()?.to_str()?;
            if !(name.starts_with(prefix) && name.ends_with(".ckpt")) {
                return None;
            }
            let generation = name
                .strip_prefix(prefix)
                .and_then(|rest| rest.strip_suffix(".ckpt"))
                .and_then(|rest| rest.strip_prefix("_gen_"))
                .and_then(|digits| digits.parse::<u64>().ok());
            Some((generation, path))
        })
        // Tuples compare field by field: generation first (`None` is lowest),
        // then the path as a tie-breaker.
        .max()
        .map(|(_, path)| path)
}

Resume or Start Fresh

/// Resumes from the newest "my_evolution" checkpoint in `checkpoint_dir` when
/// one exists; otherwise starts from a fresh random population at generation 0.
///
/// # Errors
///
/// Returns an error if a checkpoint is found but cannot be loaded.
fn run_evolution(checkpoint_dir: &Path) -> Result<(), Box<dyn Error>> {
    let (mut population, start_gen) = match find_latest_checkpoint(checkpoint_dir, "my_evolution") {
        Some(path) => {
            println!("Resuming from {:?}", path);
            let ckpt: Checkpoint<RealVector> = load_checkpoint(&path)?;
            // Read the generation before the population is moved out.
            let resumed_at = ckpt.generation;
            (reconstruct_population(ckpt.population), resumed_at)
        }
        None => {
            println!("Starting fresh");
            (Population::random(100, &bounds, &mut rng), 0)
        }
    };

    // Pick up at `start_gen` so already-completed generations are not repeated.
    for gen in start_gen..max_generations {
        // ... evolution ...
    }

    Ok(())
}

Saving Algorithm State

For algorithms with internal state (like CMA-ES):

/// Serializable copy of the CMA-ES state fields this guide persists.
#[derive(Serialize, Deserialize)]
struct CmaEsCheckpoint {
    generation: usize,
    mean: Vec<f64>,
    sigma: f64,
    covariance: Vec<Vec<f64>>,
    // ... other CMA-ES state
}

impl CmaEsCheckpoint {
    /// Captures the generation, mean, sigma and covariance held in `cmaes.state`.
    fn from_cmaes(cmaes: &CmaEs) -> Self {
        let state = &cmaes.state;
        Self {
            generation: state.generation,
            mean: state.mean.clone(),
            sigma: state.sigma,
            covariance: state.covariance.clone(),
        }
    }

    /// Builds a new `CmaEs` from the saved mean and sigma, then writes the
    /// saved covariance and generation back into its state.
    fn restore(&self) -> CmaEs {
        let mut restored = CmaEs::new(self.mean.clone(), self.sigma);
        restored.state.covariance = self.covariance.clone();
        restored.state.generation = self.generation;
        restored
    }
}

Error Handling

match load_checkpoint::<RealVector>(&path) {
    Ok(checkpoint) => {
        println!("Loaded successfully");
    }
    Err(CheckpointError::FileNotFound(path)) => {
        println!("Checkpoint not found: {:?}", path);
    }
    Err(CheckpointError::DeserializationFailed(err)) => {
        println!("Corrupted checkpoint: {}", err);
    }
    Err(e) => {
        println!("Unknown error: {}", e);
    }
}

Best Practices

1. Checkpoint Frequently Enough

// For long runs, checkpoint about every 5% of the planned generations.
// `.max(1)` keeps the interval non-zero when max_generations is below 20.
let interval = (max_generations / 20).max(1);
// `every` is chained by value when the manager is built, so keep the manager
// it returns instead of discarding it.
let mut manager = manager.every(interval);

2. Verify Checkpoints

// After saving, verify it can be loaded
manager.save(&checkpoint)?;
let verified: Checkpoint<RealVector> = load_checkpoint(&manager.latest_path())?;
assert_eq!(verified.generation, checkpoint.generation);

3. Include Random State

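For reproducible resumption, save the RNG state along with the population. Note that storing only the original seed restarts the random sequence from the beginning on resume; store the full generator state if a resumed run must match an uninterrupted one: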

use rand::SeedableRng;

#[derive(Serialize, Deserialize)]
struct FullCheckpoint<G> {
    evolution: Checkpoint<G>,
    rng_seed: u64, // Or full RNG state
}

4. Use Atomic Writes

Prevent corruption from interrupted saves:

// Write to temp file, then rename
let temp_path = path.with_extension("tmp");
write_checkpoint(&checkpoint, &temp_path)?;
std::fs::rename(temp_path, path)?;

Feature Flag

Checkpointing requires the checkpoint feature:

[dependencies]
fugue-evo = { version = "0.1", features = ["checkpoint"] }

Next Steps