Created
February 5, 2019 15:41
-
-
Save timeu/8e76335e65c4391180615f5cdef5e435 to your computer and use it in GitHub Desktop.
Checkcluster script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Dazhe.Meng:
Nice little script to display the job load on the new GMI cluster (mendel).
'''
import re
import string
import subprocess
import sys

# How many node summaries are printed side by side on one output row.
NUM_PER_ROW = 2

# A memory figure: a run of digits immediately followed by a kb/mb/gb unit.
MEM_REGEX = re.compile(r'^([\d]*)(mb|kb|gb)$')
class node():
    """A compute node together with the figures parsed for it.

    Attributes are filled in by the parser after construction; a fresh
    instance only knows its name.
    """

    def __init__(self, name):
        self.name = name
        # Both text fields start empty until the node listing is parsed.
        self.type = self.state = ''
        # Available CPUs, assigned CPUs, total memory and assigned memory.
        self.ncpu = self.load = self.tmem = self.umem = 0
        # One entry per CPU attributed to a user (names may repeat).
        self.users = []
class job():
    """A batch job as it is assembled from the job listing.

    Everything except the name starts out blank and is filled in by the
    parser as the job's attribute lines are read.
    """

    def __init__(self, name):
        self.name = name
        # Owner, execution host and state code are unknown at first.
        self.user = self.where = self.state = ''
        self.ncpu = 0
        # Set to True by the parser when the exec_host entry contains '+'.
        self.mpi = False
#class node(): | |
# def __init__(self, qhostl): | |
# o = qhostl.strip().split() | |
# self.name = o[0] | |
# self.ncpu = int(o[2]) | |
# if self.name.startswith('blade'): # hack to fix blade* number of cpu | |
# self.ncpu = 8 | |
# self.load = o[3] | |
# self.tmem = o[4][:-1] | |
# if o[5]!='-': | |
# if o[5][-1]!='G': | |
# self.umem = '{0:.1}'.format(float(o[5][:-1])/1000) | |
# else: | |
# self.umem = o[5][:-1] | |
# else: | |
# self.umem='-' | |
# self.users = [] | |
class user():
    """Per-user tally of running jobs and of jobs in any other state."""

    def __init__(self, name):
        self.name = name
        # Both counters are incremented while the job listing is processed.
        self.running = self.queueing = 0
# Shared registries filled in while the scheduler output is parsed.
Nodes = {}    # node name -> node object
Users = {}    # user name -> user object
Userids = []  # display ids that have already been handed out
d_id = {}     # user name -> display id
d_jobs = {}   # not used by the code visible in this file


def choose_userid(username):
    """Pick a one-character display id for ``username`` and register it.

    Candidates are tried in this order: the first letter of the name, that
    letter upper-cased, then the digits followed by the ASCII letters.  The
    first candidate not yet present in ``Userids`` wins.  When every
    candidate is taken the shared placeholder ``'#'`` is used, so the
    function always returns instead of failing on an unbound variable.

    Args:
        username: Name of the job owner; may be empty, in which case the
            initial-letter candidates are skipped.

    Returns:
        The chosen id.  It is also stored in ``d_id[username]`` and recorded
        in ``Userids``.
    """
    initial = username[:1]  # slicing tolerates an empty name
    candidates = [initial, initial.upper()]
    candidates.extend(string.digits + string.ascii_letters)

    newid = '#'  # overflow placeholder, shared once all ids are exhausted
    for tryid in candidates:
        if tryid and tryid not in Userids:
            newid = tryid
            break  # stop at the first free id

    d_id[username] = newid
    if newid not in Userids:
        Userids.append(newid)
    return newid
def _get_mem_from_str(mem_str):
    """Convert a memory figure such as ``'64gb'`` into whole gigabytes.

    Args:
        mem_str: Digits followed by ``kb``, ``mb`` or ``gb`` (the shape
            accepted by ``MEM_REGEX``).  Surrounding whitespace is ignored.

    Returns:
        The amount in gigabytes as an ``int``, rounded down.

    Raises:
        ValueError: If ``mem_str`` does not match ``MEM_REGEX`` or has no
            digits in front of the unit.
    """
    match = MEM_REGEX.match(mem_str.strip())
    if match is None or not match.group(1):
        raise ValueError('unrecognised memory value: %r' % (mem_str,))

    amount = int(match.group(1))
    unit = match.group(2)
    if unit == 'gb':
        return amount
    # Floor division keeps the result an integer on Python 2 and 3 alike.
    if unit == 'mb':
        return amount // 1024
    return amount // 1048576  # kb: 1024 * 1024 per gigabyte
# Capture the scheduler state once, up front.  The commands are passed as
# argument lists, so no shell is involved, and universal_newlines=True makes
# communicate() return text: the parsers below split it on "\n", which would
# fail on the bytes Python 3 returns by default.
Qhost = subprocess.Popen(["pbsnodes", "-a"], stdout=subprocess.PIPE,
                         universal_newlines=True)
Qhostout = Qhost.communicate()[0].strip()

# Qstatout is only referenced from commented-out code in this file; the
# command is still run so the module-level names stay available.
Qstat = subprocess.Popen(["qstat", "-a"], stdout=subprocess.PIPE,
                         universal_newlines=True)
Qstatout = Qstat.communicate()[0].strip()

Qstatf = subprocess.Popen(["qstat", "-ft"], stdout=subprocess.PIPE,
                          universal_newlines=True)
Qstatfout = Qstatf.communicate()[0].strip()
def read_qhost():
    """Populate ``Nodes`` from the node listing held in ``Qhostout``.

    A line that does not begin with a space ends the node currently being
    read, which is stored in ``Nodes`` only if its ``ntype`` is ``PBS``.
    If that line is non-blank it also starts a new node named after it.
    A line that begins with a space is an attribute of the current node.
    """
    # (line prefix, node attribute, converter for the text after the '=')
    fields = (
        ('state', 'state', lambda value: value.strip()),
        ('ntype', 'type', lambda value: value.strip()),
        ('resources_available.ncpus', 'ncpu', int),
        ('resources_assigned.ncpus', 'load', int),
        ('resources_available.mem', 'tmem', _get_mem_from_str),
        ('resources_assigned.mem', 'umem', _get_mem_from_str),
    )

    current = None
    # The extra '\n ' entry does not start with a space, so it acts as a
    # terminator that stores the last node of the listing.
    for raw in Qhostout.split("\n") + ['\n ']:
        if raw.startswith(' '):
            entry = raw.strip()
            for prefix, attr, convert in fields:
                if entry.startswith(prefix):
                    setattr(current, attr, convert(entry.split('=')[1]))
            continue

        if current is not None and current.type == 'PBS':
            Nodes[current.name] = current
        header = raw.strip()
        if header:
            current = node(header)
#newnode = node(l) | |
#Nodes[newnode.name]=newnode | |
#for l in Qstatout.split("\n")[2:]: | |
# o = l.strip().split() | |
# username = o[1] | |
# status = o[8] | |
# if username not in Users: | |
# Users[username]=user(username) | |
# choose_userid(username) | |
# if status == 'R': | |
# #nodeused = o[7].split("@")[1] | |
# num_slots = int(o[5]) | |
# #Nodes[nodeused].users+=[username]*num_slots | |
# Users[username].running += 1 | |
# else: | |
# Users[username].queueing += 1 | |
# WORKAROUND FOR UVM error: an execution host written as "uv2000p0[N]" is
# reduced to plain "uv2000p0" before it is looked up in Nodes.
UV_REGEX = re.compile(r'^(uv2000p0)\[\d\]$')


def _charge_cpus(hostname, username, ncpu):
    """Attribute ``ncpu`` CPUs on ``hostname`` to ``username``.

    Hosts that are not present in ``Nodes`` are skipped instead of raising
    KeyError; read_qhost() only registers nodes whose ntype is 'PBS'.
    """
    target = Nodes.get(hostname)
    if target is not None:
        target.users += [username] * ncpu


def _charge_mpi_hosts(hostspec, username):
    """Charge each 'host/slot[*ncpu]' entry of a '+'-joined exec_host value."""
    for mpihost in hostspec.split('+'):
        mpiwhere = mpihost.split('/')[0]
        # Without an explicit '*count' the entry stands for a single CPU.
        mpincpu = int(mpihost.split('*')[1]) if '*' in mpihost else 1
        _charge_cpus(mpiwhere, username, mpincpu)


def _account_job(finished):
    """Fold one fully parsed job into ``Users`` and, if running, ``Nodes``."""
    if finished.user not in Users:
        Users[finished.user] = user(finished.user)
        choose_userid(finished.user)
    if finished.state != 'R':
        Users[finished.user].queueing += 1
        return
    if not finished.mpi:
        # MPI jobs were already charged host by host while being parsed.
        where = finished.where
        uv_match = UV_REGEX.match(where)
        if uv_match:
            where = uv_match.group(1)
        _charge_cpus(where, finished.user, finished.ncpu)
    Users[finished.user].running += 1


# NOTE(review): nothing else in this file calls read_qhost(), and without it
# Nodes stays empty, so it is invoked here before jobs are mapped onto nodes.
read_qhost()

newjob = None
mpihostl = ''  # accumulates an MPI exec_host value wrapped over several lines
# The trailing fake header makes the loop account for the last real job.
for line in Qstatfout.split("\n") + ['Job Id:fake\n']:
    if line.startswith('Job Id'):
        if newjob is not None:
            _account_job(newjob)
        newjob = job(line.split(':')[1].strip())
        continue
    if newjob is None:
        # Attribute line before any job header (e.g. empty qstat output).
        continue

    entry = line.strip()
    if entry.startswith('job_state'):
        newjob.state = entry.split('=')[1].strip()
    if entry.startswith('Job_Owner'):
        newjob.user = entry.split('=')[1].strip().split('@')[0]
    if newjob.mpi and entry.startswith('exec_vnode'):
        # exec_vnode marks the end of the wrapped exec_host lines.
        _charge_mpi_hosts(mpihostl, newjob.user)
        mpihostl = ''
    if newjob.mpi and mpihostl != '':
        # Continuation of a wrapped MPI exec_host value.
        mpihostl += entry
    if entry.startswith('exec_host'):
        if newjob.state == 'R':
            if '+' in entry:
                # Several hosts joined by '+': treated as an MPI job.
                newjob.mpi = True
                mpihostl = entry.split('=')[1].strip()
            else:
                newjob.where = entry.split('=')[1].strip().split('/')[0]
    if entry.startswith('resources_used.ncpus') and not newjob.mpi:
        newjob.ncpu = int(entry.split('=')[1])
# Build one fixed-width summary per node, in node-name order.
ostr = []
for nn in sorted(Nodes.keys()):
    n = Nodes[nn]
    n.users.sort()
    # One id character per CPU that was attributed to a known user.
    usage_str = '[' + ''.join(d_id[u] for u in n.users)
    if n.state == 'offline':
        usage_str += '*' * n.ncpu + ']'
    else:
        unknown = n.load - len(n.users)  # assigned CPUs with no known owner
        free = min(n.ncpu - n.load, n.ncpu - len(n.users))
        usage_str += '?' * unknown + '-' * free + ']'
    ostr.append('{0:<8}{1:>5}/{2:<3}{3:<26}{4:>4}/{5:<4}'.format(n.name, n.load, n.ncpu, usage_str, n.umem, str(n.tmem)+'G'))

print("")
# Full rows hold NUM_PER_ROW summaries separated by single spaces.
# NOTE: the original author saw rows occasionally not being printed here;
# the cause is not known.
full_rows = len(ostr) // NUM_PER_ROW
for row in range(full_rows):
    print(" ".join(ostr[row * NUM_PER_ROW:(row + 1) * NUM_PER_ROW]))
# Leftover summaries each keep a trailing space; when nothing is left over
# this still prints an empty line.
print("".join(s + " " for s in ostr[full_rows * NUM_PER_ROW:]))

print("")
print("*: node offline")
print("?: i'm not sure who is using it, but cpus not free!")
for u in Users.values():
    print("{0}: {1:<20}{2:>4} job(s) running, {3:>4} job(s) queueing or otherwise".format(d_id[u.name],u.name, u.running, u.queueing))
print("")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment