Skip to content

Instantly share code, notes, and snippets.

@timeu
Created February 5, 2019 15:41
Show Gist options
  • Save timeu/8e76335e65c4391180615f5cdef5e435 to your computer and use it in GitHub Desktop.
Save timeu/8e76335e65c4391180615f5cdef5e435 to your computer and use it in GitHub Desktop.
Checkcluster script
'''
Dazhe.Meng:
Nice little script to display job load on new gmi cluster (mendel)
'''
import subprocess
import string
import sys
import re

# How many node entries are placed side by side on one printed row.
NUM_PER_ROW = 2

# A memory amount: a run of digits followed by one of the unit suffixes
# kb, mb or gb (lower case only).
MEM_REGEX = re.compile(r'^([\d]*)(mb|kb|gb)$')
class node(object):
    """Record for one compute node, filled in while parsing node status.

    Instances start with empty/zero values; the parser assigns the real
    values attribute by attribute.
    """

    def __init__(self, name):
        self.name = name    # node name (text of the node's header line)
        self.state = ''     # value of the node's "state" attribute
        self.ncpu = 0       # resources_available.ncpus
        self.load = 0       # resources_assigned.ncpus
        self.tmem = 0       # resources_available.mem, converted to GB
        self.umem = 0       # resources_assigned.mem, converted to GB
        self.users = []     # one user-name entry per CPU slot in use
        self.type = ''      # value of the node's "ntype" attribute
class job(object):
    """Record for one scheduler job, filled in while parsing job status.

    Instances start with empty/zero values; the parser assigns the real
    values attribute by attribute.
    """

    def __init__(self, name):
        self.name = name    # job identifier taken from the "Job Id" line
        self.state = ''     # value of "job_state" ('R' is treated as running)
        self.ncpu = 0       # resources_used.ncpus
        self.user = ''      # owner name (Job_Owner without the "@host" part)
        self.where = ''     # execution host of a single-host job
        self.mpi = False    # True when exec_host lists several hosts ('+')
#class node():
# def __init__(self, qhostl):
# o = qhostl.strip().split()
# self.name = o[0]
# self.ncpu = int(o[2])
# if self.name.startswith('blade'): # hack to fix blade* number of cpu
# self.ncpu = 8
# self.load = o[3]
# self.tmem = o[4][:-1]
# if o[5]!='-':
# if o[5][-1]!='G':
# self.umem = '{0:.1}'.format(float(o[5][:-1])/1000)
# else:
# self.umem = o[5][:-1]
# else:
# self.umem='-'
# self.users = []
class user(object):
    """Per-user job counters shown in the legend at the end of the report."""

    def __init__(self, name):
        self.name = name      # user name
        self.running = 0      # number of jobs in state 'R'
        self.queueing = 0     # number of jobs in any other state
# Module-level tables shared by the parsing and reporting code.
Users = {}      # user name -> user record
Nodes = {}      # node name -> node record
d_id = {}       # user name -> one-character display id
Userids = []    # display ids that have already been handed out
d_jobs = {}     # not referenced by any other code in this file
def choose_userid(username):
    """Assign and return a one-character display id for *username*.

    Candidates are tried in this order: the first character of the name,
    its upper-case form, then each character of
    ``string.digits + string.ascii_letters``.  The first candidate not yet
    present in ``Userids`` wins; it is recorded in ``d_id`` and appended to
    ``Userids``.

    Calling this again for a name that already has an id returns the
    existing id without consuming a new one.

    Raises:
        RuntimeError: if every candidate character is already in use.
    """
    if username in d_id:
        return d_id[username]

    # Slicing instead of indexing keeps an empty name from raising
    # IndexError; it simply skips the "first letter" candidates.
    initial = username[:1]
    candidates = []
    if initial:
        candidates.extend([initial, initial.upper()])
    candidates.extend(string.digits + string.ascii_letters)

    for newid in candidates:
        if newid not in Userids:
            break
    else:
        raise RuntimeError(
            'no free display id left for user {0!r}'.format(username))

    d_id[username] = newid
    Userids.append(newid)
    return newid
def _get_mem_from_str(mem_str):
    """Convert a memory string such as ``'2048mb'`` to whole gigabytes.

    The string (surrounding whitespace ignored) must match ``MEM_REGEX``:
    digits followed by ``kb``, ``mb`` or ``gb``.  The result is an integer;
    kb and mb amounts are floor-divided, so anything below 1 GB becomes 0.

    Raises:
        ValueError: if the string has no digits or an unsupported unit.
    """
    match = MEM_REGEX.match(mem_str.strip())
    # The pattern allows an empty digit run, so check group(1) as well.
    if match is None or not match.group(1):
        raise ValueError('unrecognised memory value: {0!r}'.format(mem_str))

    amount = int(match.group(1))
    unit = match.group(2)
    if unit == 'gb':
        return amount
    if unit == 'mb':
        # Floor division keeps the result an int on Python 2 and 3 alike.
        return amount // 1024
    return amount // 1048576  # unit is 'kb'
# Capture the scheduler reports once, as text.  Commands are given as
# argument lists (no shell involved) and universal_newlines=True makes
# communicate() return str rather than bytes, so the "\n" splitting done by
# the parsing code works on Python 3 as well as Python 2.
Qhost = subprocess.Popen(["pbsnodes", "-a"], stdout=subprocess.PIPE,
                         universal_newlines=True)
Qhostout = Qhost.communicate()[0].strip()

# Only commented-out legacy code in this file reads Qstatout.
Qstat = subprocess.Popen(["qstat", "-a"], stdout=subprocess.PIPE,
                         universal_newlines=True)
Qstatout = Qstat.communicate()[0].strip()

Qstatf = subprocess.Popen(["qstat", "-ft"], stdout=subprocess.PIPE,
                          universal_newlines=True)
Qstatfout = Qstatf.communicate()[0].strip()
def read_qhost():
    """Parse the ``pbsnodes -a`` text held in ``Qhostout`` into ``Nodes``.

    A line that does not start with a space is a node header: the node
    collected so far is stored and, if the line is non-blank, a new ``node``
    named after the stripped line is started.  Lines that start with a space
    are ``key = value`` attributes of the current node.

    Only nodes whose ``ntype`` is ``PBS`` are stored, keyed by node name.
    """
    def _store(candidate):
        # Keep a finished node only when it is of type PBS.
        if candidate is not None and candidate.type == 'PBS':
            Nodes[candidate.name] = candidate

    current = None
    for line in Qhostout.split("\n"):
        if not line.startswith(' '):
            _store(current)
            header = line.strip()
            if header:
                current = node(header)
            continue

        if current is None:
            # Attribute line before any node header: nothing to attach to.
            continue

        key, sep, value = line.strip().partition('=')
        if not sep:
            continue
        # Compare the full attribute name so that an attribute which merely
        # starts with one of these names is not parsed by mistake.
        key = key.strip()
        value = value.strip()
        if key == 'state':
            current.state = value
        elif key == 'ntype':
            current.type = value
        elif key == 'resources_available.ncpus':
            current.ncpu = int(value)
        elif key == 'resources_assigned.ncpus':
            current.load = int(value)
        elif key == 'resources_available.mem':
            current.tmem = _get_mem_from_str(value)
        elif key == 'resources_assigned.mem':
            current.umem = _get_mem_from_str(value)

    # Store the last node; there is no following header to trigger it.
    _store(current)


# Nothing else in this file calls read_qhost(); run it here so that Nodes is
# filled before the job report is attributed to nodes.
read_qhost()
#newnode = node(l)
#Nodes[newnode.name]=newnode
#for l in Qstatout.split("\n")[2:]:
# o = l.strip().split()
# username = o[1]
# status = o[8]
# if username not in Users:
# Users[username]=user(username)
# choose_userid(username)
# if status == 'R':
# #nodeused = o[7].split("@")[1]
# num_slots = int(o[5])
# #Nodes[nodeused].users+=[username]*num_slots
# Users[username].running += 1
# else:
# Users[username].queueing += 1
# WORKAROUND FOR UVM error: a host reported as "uv2000p0[<digit>]" is
# charged to "uv2000p0" (presumably the name under which the node list
# knows that machine -- confirm against pbsnodes output).
UV_REGEX = re.compile(r'^(uv2000p0)\[\d\]$')


def _charge_node(node_name, username, ncpu):
    """Add *ncpu* slot entries for *username* to the node called *node_name*.

    Hosts that are not present in ``Nodes`` are ignored instead of raising
    KeyError, so one unknown host cannot abort the whole report.
    """
    uv_match = UV_REGEX.match(node_name)
    if uv_match:
        node_name = uv_match.group(1)
    target = Nodes.get(node_name)
    if target is not None:
        target.users += [username] * ncpu


def _finish_job(done):
    """Fold one completely parsed job into the ``Users``/``Nodes`` tallies."""
    if done.user not in Users:
        Users[done.user] = user(done.user)
        choose_userid(done.user)
    if done.state == 'R':
        # Multi-host (mpi) jobs were already charged host by host when their
        # exec_vnode line was reached.
        if not done.mpi:
            _charge_node(done.where, done.user, done.ncpu)
        Users[done.user].running += 1
    else:
        Users[done.user].queueing += 1


# Walk the "qstat -ft" text: a "Job Id" line starts a new job, every other
# line is an attribute (or a wrapped continuation) of the current job.
newjob = None
mpihostl = ''   # exec_host value of a multi-host job, joined across wrapped lines
for l in Qstatfout.split("\n"):
    if l.startswith('Job Id'):
        if newjob is not None:
            _finish_job(newjob)
        newjob = job(l.split(':')[1].strip())
        mpihostl = ''
        continue

    if newjob is None:
        # Text before the first "Job Id" line (e.g. empty qstat output).
        continue

    # The order of the checks below matters and mirrors the original script.
    l = l.strip()
    if l.startswith('job_state'):
        newjob.state = l.split('=')[1].strip()
    if l.startswith('Job_Owner'):
        newjob.user = l.split('=')[1].strip().split('@')[0]
    if newjob.mpi and l.startswith('exec_vnode'):
        # exec_vnode marks the end of the (possibly wrapped) exec_host value.
        # Each '+'-separated piece is "host/..." optionally ending in "*ncpus".
        for mpihost in mpihostl.split('+'):
            mpiwhere = mpihost.split('/')[0]
            if '*' in mpihost:
                mpincpu = int(mpihost.split('*')[1])
            else:
                mpincpu = 1
            _charge_node(mpiwhere, newjob.user, mpincpu)
        mpihostl = ''
    if newjob.mpi and mpihostl != '':
        # Continuation line of a wrapped exec_host value.
        mpihostl += l
    if l.startswith('exec_host'):
        if newjob.state == 'R':
            if '+' in l:
                # Several hosts: treat as an mpi job and start collecting.
                newjob.mpi = True
                mpihostl = l.split('=')[1].strip()
            else:
                newjob.where = l.split('=')[1].strip().split('/')[0]
    if l.startswith('resources_used.ncpus') and not newjob.mpi:
        newjob.ncpu = int(l.split('=')[1])

# Account for the last job; no further "Job Id" line follows it.
if newjob is not None:
    _finish_job(newjob)
# Build one formatted entry per node, in node-name order.
ostr = []
for nn in sorted(Nodes):
    n = Nodes[nn]
    n.users.sort()
    # One display id per occupied CPU slot.
    usage_str = '[' + ''.join(d_id[u] for u in n.users)
    if n.state == 'offline':
        usage_str += '*' * n.ncpu + ']'
    else:
        # '?' marks assigned CPUs that no parsed job accounts for, '-' marks
        # free CPUs; a negative count simply yields an empty string.
        unknown = n.load - len(n.users)
        free = min(n.ncpu - n.load, n.ncpu - len(n.users))
        usage_str += '?' * unknown + '-' * free + ']'
    ostr.append('{0:<8}{1:>5}/{2:<3}{3:<26}{4:>4}/{5:<4}'.format(n.name, n.load, n.ncpu, usage_str, n.umem, str(n.tmem)+'G'))

# Print the entries NUM_PER_ROW to a row.  Single-argument print(...) calls
# behave the same on Python 2 and Python 3.
print("")
for start in range(0, len(ostr), NUM_PER_ROW):
    print(" ".join(ostr[start:start + NUM_PER_ROW]))

# Legend: marker characters first, then one line per user (sorted by name).
print("")
print("*: node offline")
print("?: i'm not sure who is using it, but cpus not free!")
for uu in sorted(Users):
    u = Users[uu]
    print("{0}: {1:<20}{2:>4} job(s) running, {3:>4} job(s) queueing or otherwise".format(d_id[u.name],u.name, u.running, u.queueing))
print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment