def _expand_jobs(finished_models, flu_seasons, config_names):
    """Build the model x season x date x config job grid.

    Args:
        finished_models: Sequence of dicts providing at least the keys
            ``'scenario_id'`` and ``'run_id'``.
        flu_seasons: Sequence of dicts with keys ``'name'``, ``'start'``,
            ``'end'`` and ``'freq'``; the last three are passed to
            ``pd.date_range``.
        config_names: Sequence of config name strings.

    Returns:
        A tuple ``(jobs, n_dates)``. ``jobs`` is a list of dicts with keys
        ``job_id``, ``scenario_id``, ``run_id``, ``season``, ``date``
        (``YYYY-MM-DD`` string) and ``config``, ordered model -> season ->
        date -> config with ``job_id`` counting up from 0. ``n_dates`` is the
        number of forecast dates summed over all seasons.
    """
    # Forecast dates depend only on the season, so compute them once rather
    # than once per model.
    season_dates = [
        (
            season['name'],
            [
                stamp.strftime('%Y-%m-%d')
                for stamp in pd.date_range(season['start'], season['end'], freq=season['freq'])
            ],
        )
        for season in flu_seasons
    ]

    jobs = []
    for model in finished_models:
        for season_name, dates in season_dates:
            for date in dates:
                for config_name in config_names:
                    jobs.append({
                        'job_id': len(jobs),
                        'scenario_id': model['scenario_id'],
                        'run_id': model['run_id'],
                        'season': season_name,
                        'date': date,
                        'config': config_name,
                    })

    n_dates = sum(len(dates) for _, dates in season_dates)
    return jobs, n_dates


def _render_slurm_script(n_jobs, experiment_name, inpaint_exp, job_list_file):
    """Return the text of a SLURM array script that runs one job-list row per task.

    Each array task reads line ``SLURM_ARRAY_TASK_ID + 2`` of ``job_list_file``
    (line 1 is the CSV header), so tasks map to rows by position in the file,
    not by the ``job_id`` column.

    Args:
        n_jobs: Number of rows in the job list; must be >= 1, the array range
            is ``0-(n_jobs - 1)``.
        experiment_name: Training experiment name, exported as ``TRAINING_EXP``.
        inpaint_exp: Inpainting experiment name, passed to the job via ``-e``.
        job_list_file: Path of the job list, embedded as written (a relative
            path is resolved by the shell at job run time).
    """
    # Shell expansions are quoted so paths or values containing whitespace
    # are not word-split.
    return f"""#!/bin/bash
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --qos gpu_access
#SBATCH -p a100-gpu,l40-gpu,jlessler
#SBATCH --mem=32G
#SBATCH -t 00-04:00:00
#SBATCH --array=0-{n_jobs - 1}
#SBATCH --gres=gpu:1
module purge
# Experiment configuration
TRAINING_EXP="{experiment_name}"
INPAINT_EXP="{inpaint_exp}"
JOB_LIST="{job_list_file}"
echo "Inpainting job ${{SLURM_ARRAY_TASK_ID}}"
# Get job parameters from job list
JOB_LINE=$(sed -n "$((SLURM_ARRAY_TASK_ID + 2))p" "$JOB_LIST") # +2 to skip header
SCENARIO_ID=$(echo "$JOB_LINE" | cut -d',' -f2)
RUN_ID=$(echo "$JOB_LINE" | cut -d',' -f3)
SEASON=$(echo "$JOB_LINE" | cut -d',' -f4)
DATE=$(echo "$JOB_LINE" | cut -d',' -f5)
CONFIG=$(echo "$JOB_LINE" | cut -d',' -f6)
echo "Scenario: ${{SCENARIO_ID}}, Run: ${{RUN_ID}}, Season: ${{SEASON}}, Date: ${{DATE}}, Config: ${{CONFIG}}"
# Run atomic inpainting with known run_id
/nas/longleaf/home/chadi/.conda/envs/diffusion_torch6/bin/python -u -m influpaint.batch.inpainting \\
-s "${{SCENARIO_ID}}" \\
-r "${{RUN_ID}}" \\
-e "${{INPAINT_EXP}}" \\
--forecast_date "${{DATE}}" \\
--config_name "${{CONFIG}}" \\
> "out_inpaint_s${{SCENARIO_ID}}_${{SEASON}}_${{DATE}}_${{CONFIG}}.out" 2>&1
echo "Completed: Scenario ${{SCENARIO_ID}}, Season ${{SEASON}}, Date ${{DATE}}, Config ${{CONFIG}}"
"""


@click.command()
@click.option("-e", "--experiment_name", required=True, help="MLflow training experiment name (e.g. 'paper-2025-06_training')")
@click.option("--configs", default=None, help="Comma-separated config names (default: all available configs)")
@click.option("--output_dir", default=".", help="Where to write job files")
@click.option("--skip_completed", is_flag=True, help="Skip jobs that already completed successfully in inpainting experiment")
def main(experiment_name, configs, output_dir, skip_completed):
    """Generate SLURM job files for inpainting all finished models from MLflow experiment on last 2 flu seasons
    \f
    Everything after the form feed above is hidden from ``--help`` by click.

    Writes two files into ``output_dir`` (created if missing):

    * ``inpaint_jobs_<experiment>.txt`` - CSV job list, one row per
      model x season x forecast date x config combination.
    * ``inpaint_array_<experiment>.run`` - executable SLURM array script with
      one array task per job-list row.

    ``<experiment>`` is ``experiment_name`` with ``'_training'`` removed.

    Args:
        experiment_name: MLflow training experiment to query for finished models.
        configs: Comma-separated config names, or ``None`` to use every entry
            of ``AVAILABLE_COPAINT_CONFIGS``. Empty entries are ignored.
        output_dir: Directory receiving the job list and SLURM script.
        skip_completed: When true, jobs are passed through
            ``filter_completed_jobs`` before being written.
    """
    # Get finished models from MLflow
    print(f"Querying MLflow experiment: {experiment_name}")
    finished_models = get_finished_models(experiment_name)
    if not finished_models:
        print(f"No finished models found in experiment '{experiment_name}'")
        return
    print(f"Found {len(finished_models)} finished models")

    # Flu seasons used for scoring. The 2022-2023 season is currently
    # disabled; only the two most recent seasons are active.
    flu_seasons = [
        # {
        #     'name': '2022-2023',
        #     'start': '2022-10-17',
        #     'end': '2023-05-15',
        #     'freq': 'W-SAT'
        # },
        {
            'name': '2023-2024',
            'start': '2023-10-14',
            'end': '2024-05-04',
            'freq': '2W-SAT'
        },
        {
            'name': '2024-2025',
            'start': '2024-11-23',
            'end': '2025-05-31',
            'freq': '2W-SAT'
        }
    ]

    # Use all available configs if none specified
    if configs is None:
        from .config import AVAILABLE_COPAINT_CONFIGS
        configs = ",".join(AVAILABLE_COPAINT_CONFIGS)
        print(f"Using all available configs: {configs}")
    # Drop empty entries (e.g. from a trailing comma) so they do not turn
    # into jobs with an empty config name.
    config_names = [name.strip() for name in configs.split(",") if name.strip()]
    print(f"Flu seasons: {[s['name'] for s in flu_seasons]}")
    print(f"Configs: {config_names}")

    # Generate all combinations: model × season × date × config
    jobs, n_dates = _expand_jobs(finished_models, flu_seasons, config_names)
    print(
        f"Total jobs: {len(jobs)} = {len(finished_models)} models × "
        f"{n_dates} forecast dates (across {len(flu_seasons)} seasons) × "
        f"{len(config_names)} configs"
    )

    # Filter out completed jobs if requested
    if skip_completed:
        # NOTE(review): the training experiment name is passed here; presumably
        # the helper derives the inpainting experiment from it - confirm.
        jobs = filter_completed_jobs(jobs, experiment_name)
        print(f"After filtering completed jobs: {len(jobs)} remaining")
        if not jobs:
            print("No jobs remaining after filtering - all appear to be completed!")
            return
    if not jobs:
        # Without this guard the script would get the invalid range
        # "--array=0--1".
        print("No jobs to generate - check --configs and the flu season definitions")
        return

    # Write job list with run_id included
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    experiment_basename = experiment_name.replace('_training', '')
    job_list_file = output_path / f"inpaint_jobs_{experiment_basename}.txt"
    # Plain "\n"-terminated rows: the SLURM script parses them with `cut`, so
    # no CSV quoting or "\r\n" line endings may be introduced here.
    # After filtering, job_id values may have gaps; array tasks select rows by
    # line position, so that is harmless.
    with open(job_list_file, 'w', encoding='utf-8') as f:
        f.write("job_id,scenario_id,run_id,season,date,config\n")
        f.writelines(
            f"{job['job_id']},{job['scenario_id']},{job['run_id']},{job['season']},{job['date']},{job['config']}\n"
            for job in jobs
        )
    print(f"Job list written to: {job_list_file}")

    # Generate SLURM script
    inpaint_exp = f"{experiment_basename}_inpainting"
    slurm_script = _render_slurm_script(len(jobs), experiment_name, inpaint_exp, job_list_file)
    slurm_file = output_path / f"inpaint_array_{experiment_basename}.run"
    with open(slurm_file, 'w', encoding='utf-8') as f:
        f.write(slurm_script)
    # Make executable
    slurm_file.chmod(0o755)

    print(f"SLURM script written to: {slurm_file}")
    print()
    print("To submit:")
    print(f" sbatch {slurm_file}")
    print()
    print("To monitor:")
    print(" squeue -u $USER")
    print(" ls out_inpaint_s*_*_*_*.out")
    print()
    print("Expected output files: out_inpaint_s<SCENARIO>_<SEASON>_<DATE>_<CONFIG>.out")
    # Build the example from a real job so it always names an active season.
    example = jobs[0]
    print(f"Example: out_inpaint_s{example['scenario_id']}_{example['season']}_{example['date']}_{example['config']}.out")
    print()
    print("Model summary:")
    for model in finished_models[:5]:  # Show first 5
        print(f" Scenario {model['scenario_id']}: {model['scenario_string']} (run: {model['run_id'][:8]}...)")
    if len(finished_models) > 5:
        print(f" ... and {len(finished_models) - 5} more models")