accounting_troubleshooting
Differences
This shows you the differences between two versions of the page.
| Next revision | Previous revision | ||
| accounting_troubleshooting [2026/02/02 16:13] – created bbruzzo | accounting_troubleshooting [2026/02/06 17:15] (current) – bbruzzo | ||
|---|---|---|---|
| Line 14: | Line 14: | ||
| MaxTRESPJ= | MaxTRESPJ= | ||
| </ | </ | ||
| + | |||
| + | ====== Revisión de Database ====== | ||
| + | |||
| + | Para hacer un dump de la database, desde mmgt02: | ||
| + | |||
| + | < | ||
| + | |||
| + | ====== Parsear desde sacct ====== | ||
| + | < | ||
| + | |||
| + | ====== Python Script reporte horas ====== | ||
| + | |||
| + | < | ||
| + | # | ||
| + | |||
| + | import subprocess | ||
| + | |||
| + | def get_accounts(): | ||
| + | command = [' | ||
| + | output = subprocess.run(command, | ||
| + | accounts = output.stdout.split() | ||
| + | return accounts | ||
| + | |||
| + | def get_hours(account): | ||
| + | command = [' | ||
| + | pipe_command= [' | ||
| + | |||
| + | proc = subprocess.Popen(command, | ||
| + | pipe_proc = subprocess.Popen(pipe_command, | ||
| + | stdout, | ||
| + | |||
| + | print(account) | ||
| + | print(stdout) | ||
| + | |||
| + | if __name__ == ' | ||
| + | accounts = get_accounts() | ||
| + | |||
| + | for account in accounts: | ||
| + | if account.startswith((' | ||
| + | get_hours(account) | ||
| + | </ | ||
| + | |||
| + | ====== Explorando con una QOS de prueba ====== | ||
| + | |||
| + | Quiero corroborar que a pesar de que assoc_mgr ya no reporte horas de qos, no ocurra que las qos igual tengan sus recursos usados registrados, | ||
| + | |||
| + | Tenemos la qosprueba. | ||
| + | |||
| + | < | ||
| + | Name Flags GrpTRESMins | ||
| + | ---------- -------------------- ------------------------------ | ||
| + | | ||
| + | |||
| + | La misma figura con 1100 minutos CPU. | ||
| + | |||
| + | < | ||
| + | 1100.68 | ||
| + | </ | ||
| + | |||
| + | Son jobs que se utilizaron con los usuarios ' | ||
| + | |||
| + | < | ||
| + | JobID | ||
| + | ------------ ------------------- ---------- --------- ---------- ---------- ---------- ---------- ---------- | ||
| + | 604 2025-06-18T15: | ||
| + | 605 2025-06-18T15: | ||
| + | 609 2025-06-18T15: | ||
| + | 610 2025-06-18T16: | ||
| + | 624 2025-06-19T15: | ||
| + | 626 2025-06-19T16: | ||
| + | 628 2025-06-19T16: | ||
| + | 629 2025-06-19T16: | ||
| + | 630 2025-06-19T16: | ||
| + | 631 2025-06-19T16: | ||
| + | 632 2025-06-19T16: | ||
| + | 633 2025-06-19T16: | ||
| + | 637 2025-06-23T15: | ||
| + | 638 2025-06-23T15: | ||
| + | 639 2025-06-23T15: | ||
| + | 640 2025-06-23T15: | ||
| + | 970 2025-07-07T18: | ||
| + | 971 2025-07-07T18: | ||
| + | 972 2025-07-07T18: | ||
| + | 973 2025-07-07T18: | ||
| + | 974 2025-07-07T18: | ||
| + | 975 2025-07-07T18: | ||
| + | 983 2025-07-08T13: | ||
| + | 984 2025-07-08T13: | ||
| + | 985 2025-07-08T13: | ||
| + | 986 2025-07-08T13: | ||
| + | 987 2025-07-08T13: | ||
| + | 988 2025-07-08T13: | ||
| + | 1083 | ||
| + | 1084 | ||
| + | 1085 | ||
| + | 1086 | ||
| + | 1088 | ||
| + | 1090 | ||
| + | 1091 | ||
| + | 1092 | ||
| + | 1093 | ||
| + | 1139 | ||
| + | 1141 | ||
| + | 1158 | ||
| + | 1159 | ||
| + | 1167 | ||
| + | 1168 | ||
| + | </ | ||
| + | |||
| + | La cuenta ' | ||
| + | |||
| + | < | ||
| + | | ||
| + | ---------- -------------------- -------------------- | ||
| + | </ | ||
| + | |||
| + | Pero podemos asociar la qos a otra account para probarla. | ||
| + | |||
| + | Antes de hacer nada verificamos que la qos reporta 0 horas de uso con assoc_mgr. | ||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=0.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(0.00) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(0), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | </ | ||
| + | |||
| + | Lo mismo podemos ver con scontrol show assoc. | ||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=0.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(0.00) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(0), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | MaxTRESPN= | ||
| + | MaxTRESMinsPJ= | ||
| + | MinPrioThresh= | ||
| + | </ | ||
| + | |||
| + | Como vemos, indica: | ||
| + | |||
| + | < | ||
| + | |||
| + | En efecto, parecería que hay 3000 minutos disponibles, | |
| + | Queremos comprobar si hay 3000 minutos disponibles, | ||
| + | |||
| + | Asignamos la qos a una cuenta de root. | ||
| + | |||
| + | < | ||
| + | # sacctmgr show assoc user=root WithRawQOSLevel format=cluster, | ||
| + | | ||
| + | ---------- ---------- ---------- --------- ------------- -------------------- ---------- | ||
| + | clementina | ||
| + | |||
| + | |||
| + | $ sacctmgr modify user root set qos+=qosprueba | ||
| + | | ||
| + | C = clementina A = root U = root | ||
| + | Would you like to commit changes? (You have 30 seconds to decide) | ||
| + | (N/y): y | ||
| + | |||
| + | |||
| + | $ sacctmgr show assoc user=root WithRawQOSLevel format=cluster, | ||
| + | | ||
| + | ---------- ---------- ---------- --------- ------------- -------------------- ---------- | ||
| + | clementina | ||
| + | |||
| + | </ | ||
| + | |||
| + | Enviemos un job de 2400 minutos de CPU (200 minutos en 12 cores) a ver qué ocurre. | |
| + | |||
| + | < | ||
| + | #SBATCH --job-name=sleep | ||
| + | #SBATCH --ntasks=12 | ||
| + | #SBATCH --partition=gpunode | ||
| + | #SBATCH --qos=qosprueba | ||
| + | #SBATCH --time=200 #200 minutos en 12 cores son 2400 minutos. | ||
| + | |||
| + | sleep 60000 # el job debería hacer timeout por la directiva --time. | ||
| + | </ | ||
| + | |||
| + | A priori me permitió encolar el job: | ||
| + | |||
| + | < | ||
| + | Submitted batch job 281155 | ||
| + | [root@mmgt02 slurm]# squeue --me | ||
| + | JOBID PARTITION | ||
| + | 281155 | ||
| + | </ | ||
| + | |||
| + | El job muestra como qos asignada qosprueba. | ||
| + | |||
| + | < | ||
| + | [root@mmgt02 slurm]# scontrol show job 281155 | ||
| + | JobId=281155 JobName=sleep | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | </ | ||
| + | |||
| + | El job terminó de correr, o sea, usó más tiempo del que podía: | |
| + | |||
| + | < | ||
| + | JobID | ||
| + | ------------ ---------- ---------- ---------- ---------- -------- ---------- ---------- | ||
| + | 281155 | ||
| + | </ | ||
| + | |||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=144288.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(200.40) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(2404), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | MaxTRESPN= | ||
| + | MaxTRESMinsPJ= | ||
| + | MinPrioThresh= | ||
| + | MinTRESPJ= | ||
| + | PreemptMode=OFF | ||
| + | Priority=0 | ||
| + | Account Limits | ||
| + | root | ||
| + | MaxJobsPA=N(0) MaxJobsAccruePA=N(0) MaxSubmitJobsPA=N(0) | ||
| + | MaxTRESPA=cpu=N(0), | ||
| + | User Limits | ||
| + | root(0) | ||
| + | MaxJobsPU=N(0) MaxJobsAccruePU=N(0) MaxSubmitJobsPU=N(0) | ||
| + | MaxTRESPU=cpu=N(0), | ||
| + | </ | ||
| + | |||
| + | Ahora muestra que la qos tiene 3000 minutos disponibles pero usó 3500. | |
| + | |||
| + | < | ||
| + | 3505.48 | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | UsageRaw=144288.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(200.40) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(2404), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | </ | ||
| + | |||
| + | ¿Vuelve a entrar el mismo job? | ||
| + | |||
| + | Sí, vuelve a correr: | |
| + | |||
| + | < | ||
| + | JobID | ||
| + | ------------ ---------- ---------- ---------- ---------- ---------- ---------- | ||
| + | 281155 | ||
| + | 282169 | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | 4141.28 | ||
| + | </ | ||
| + | |||
| + | Vemos que la qos tiene consumidos 4141 minutos. | ||
| + | |||
| + | Cree que corrió 3040 minutos: | ||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=182436.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(253.38) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(3040), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | </ | ||
| + | |||
| + | |||
| + | ====== DefaultQOS ====== | ||
| + | |||
| + | Previo al update las accounts tenían una QOS por default (¿o no?). | |
| + | |||
| + | Hay que actualizarlo para que contabilice las horas correctamente: | ||
| + | |||
| + | < | ||
| + | |||
| + | |||
| + | ====== FIX ====== | ||
| + | |||
| + | Utilicé este script para actualizar las qos a los valores restantes y setear las qos default: | ||
| + | |||
| + | <code python fix_qos.py># | ||
| + | |||
| + | import subprocess | ||
| + | |||
| + | def get_accounts(): | ||
| + | command = [' | ||
| + | output = subprocess.run(command, | ||
| + | accounts = output.stdout.split() | ||
| + | return accounts | ||
| + | |||
| + | def get_hours(account): | ||
| + | command = [' | ||
| + | ' | ||
| + | ' | ||
| + | cpu_hours_command = [' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | gpu_hours_command = [' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | |||
| + | cpu_hours = subprocess.run( | ||
| + | cpu_hours_command, | ||
| + | capture_output=True, | ||
| + | encoding=' | ||
| + | | ||
| + | cpu_hours = [int(i) for i in cpu_hours] | ||
| + | cpu_hours = sum(cpu_hours) | ||
| + | |||
| + | gpu_hours = subprocess.run( | ||
| + | gpu_hours_command, | ||
| + | capture_output=True, | ||
| + | encoding=' | ||
| + | |||
| + | gpu_hours = [int(i) for i in gpu_hours] | ||
| + | gpu_hours = sum(gpu_hours) | ||
| + | | ||
| + | return cpu_hours, gpu_hours | ||
| + | |||
| + | def update_qos(account): | ||
| + | match account: | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | |||
| + | spent_cpu, spent_gpu = get_hours(account) | ||
| + | |||
| + | new_cpu = max(0, | ||
| + | new_gpu = max(0, | ||
| + | | ||
| + | update_command = [' | ||
| + | ' | ||
| + | |||
| + | print(update_command) | ||
| + | subprocess.run(update_command) | ||
| + | |||
| + | def update_defqos(account): | ||
| + | ''' | ||
| + | Sets Default QOS to IPAC project in case of misconfiguration. | ||
| + | ''' | ||
| + | command = [' | ||
| + | ' | ||
| + | subprocess.run(command) | ||
| + | |||
| + | def action(accounts, | ||
| + | for account in accounts: | ||
| + | if account.startswith((' | ||
| + | func(account) | ||
| + | |||
| + | actions = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | if __name__ == ' | ||
| + | print(" | ||
| + | "2) update_defqos\n"," | ||
| + | choice = input(" | ||
| + | func = actions.get(choice) | ||
| + | |||
| + | accounts = get_accounts() | ||
| + | if func: | ||
| + | action(accounts, | ||
| + | else: | ||
| + | print(" | ||
| + | </ | ||
| + | |||
accounting_troubleshooting.1770048814.txt.gz · Last modified: by bbruzzo
