accounting_troubleshooting
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| accounting_troubleshooting [2026/02/04 14:21] – QOS de prueba bbruzzo | accounting_troubleshooting [2026/02/06 17:15] (current) – bbruzzo | ||
|---|---|---|---|
| Line 48: | Line 48: | ||
| print(stdout) | print(stdout) | ||
| - | if __name__ == ' | + | if __name__ == ' |
| accounts = get_accounts() | accounts = get_accounts() | ||
| Line 245: | Line 245: | ||
| </ | </ | ||
| - | To be continued... | + | El job terminó de correr, o sea, usó más tiempo del que podía: |
| + | |||
| + | < | ||
| + | JobID | ||
| + | ------------ ---------- ---------- ---------- ---------- -------- ---------- ---------- | ||
| + | 281155 | ||
| + | </ | ||
| + | |||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=144288.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(200.40) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(2404), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | MaxTRESPN= | ||
| + | MaxTRESMinsPJ= | ||
| + | MinPrioThresh= | ||
| + | MinTRESPJ= | ||
| + | PreemptMode=OFF | ||
| + | Priority=0 | ||
| + | Account Limits | ||
| + | root | ||
| + | MaxJobsPA=N(0) MaxJobsAccruePA=N(0) MaxSubmitJobsPA=N(0) | ||
| + | MaxTRESPA=cpu=N(0), | ||
| + | User Limits | ||
| + | root(0) | ||
| + | MaxJobsPU=N(0) MaxJobsAccruePU=N(0) MaxSubmitJobsPU=N(0) | ||
| + | MaxTRESPU=cpu=N(0), | ||
| + | </ | ||
| + | |||
| + | Ahora muestra que la qos tiene 3000 minutos disponibles pero usó 3500. | ||
| + | |||
| + | < | ||
| + | 3505.48 | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | UsageRaw=144288.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(200.40) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(2404), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | </ | ||
| + | |||
| + | ¿Vuelve a entrar el mismo job? | ||
| + | |||
| + | Sí, vuelve a correr: | ||
| + | |||
| + | < | ||
| + | JobID | ||
| + | ------------ ---------- ---------- ---------- ---------- ---------- ---------- | ||
| + | 281155 | ||
| + | 282169 | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | 4141.28 | ||
| + | </ | ||
| + | |||
| + | Vemos que la qos tiene consumidos 4141 minutos. | ||
| + | |||
| + | Cree que corrió 3040 minutos: | ||
| + | |||
| + | < | ||
| + | QOS=qosprueba(32) | ||
| + | UsageRaw=182436.000000 | ||
| + | GrpJobs=N(0) GrpJobsAccrue=N(0) GrpSubmitJobs=N(0) GrpWall=N(253.38) | ||
| + | GrpTRES=cpu=N(0), | ||
| + | GrpTRESMins=cpu=3000(3040), | ||
| + | GrpTRESRunMins=cpu=N(0), | ||
| + | MaxWallPJ= | ||
| + | MaxTRESPJ= | ||
| + | </ | ||
| + | |||
| + | |||
| + | ====== DefaultQOS ====== | ||
| + | |||
| + | Previo al update las accounts tenían una QOS por default (¿o no?). | ||
| + | |||
| + | Hay que actualizarlo para que contabilice las horas correctamente: | ||
| + | |||
| + | < | ||
| + | |||
| + | |||
| + | ====== FIX ====== | ||
| + | |||
| + | Utilicé este script para actualizar las qos a los valores restantes y setear las qos default: | ||
| + | |||
| + | <code python fix_qos.py># | ||
| + | |||
| + | import subprocess | ||
| + | |||
| + | def get_accounts(): | ||
| + | command = [' | ||
| + | output = subprocess.run(command, | ||
| + | accounts = output.stdout.split() | ||
| + | return accounts | ||
| + | |||
| + | def get_hours(account): | ||
| + | command = [' | ||
| + | ' | ||
| + | ' | ||
| + | cpu_hours_command = [' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | gpu_hours_command = [' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | |||
| + | cpu_hours = subprocess.run( | ||
| + | cpu_hours_command, | ||
| + | capture_output=True, | ||
| + | encoding=' | ||
| + | |||
| + | cpu_hours = [int(i) for i in cpu_hours] | ||
| + | cpu_hours = sum(cpu_hours) | ||
| + | |||
| + | gpu_hours = subprocess.run( | ||
| + | gpu_hours_command, | ||
| + | capture_output=True, | ||
| + | encoding=' | ||
| + | |||
| + | gpu_hours = [int(i) for i in gpu_hours] | ||
| + | gpu_hours = sum(gpu_hours) | ||
| + | |||
| + | return cpu_hours, gpu_hours | ||
| + | |||
| + | def update_qos(account): | ||
| + | match account: | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | case a if a.startswith(' | ||
| + | default_cpu, | ||
| + | |||
| + | spent_cpu, spent_gpu = get_hours(account) | ||
| + | |||
| + | new_cpu = max(0, | ||
| + | new_gpu = max(0, | ||
| + | |||
| + | update_command = [' | ||
| + | ' | ||
| + | |||
| + | print(update_command) | ||
| + | subprocess.run(update_command) | ||
| + | |||
| + | def update_defqos(account): | ||
| + | ''' | ||
| + | Sets Default QOS to IPAC project in case of misconfiguration. | ||
| + | ''' | ||
| + | command = [' | ||
| + | ' | ||
| + | subprocess.run(command) | ||
| + | |||
| + | def action(accounts, | ||
| + | for account in accounts: | ||
| + | if account.startswith((' | ||
| + | func(account) | ||
| + | |||
| + | actions = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | if __name__ == ' | ||
| + | print(" | ||
| + | "2) update_defqos\n"," | ||
| + | choice = input(" | ||
| + | func = actions.get(choice) | ||
| + | |||
| + | accounts = get_accounts() | ||
| + | if func: | ||
| + | action(accounts, | ||
| + | else: | ||
| + | print(" | ||
| + | </ | ||
accounting_troubleshooting.1770214875.txt.gz · Last modified: by bbruzzo
