Skip to content

Instantly share code, notes, and snippets.

@ian-whitestone
Created September 29, 2020 02:03
Show Gist options
  • Save ian-whitestone/d3b876e77743923b112d7d004d86480c to your computer and use it in GitHub Desktop.
Save ian-whitestone/d3b876e77743923b112d7d004d86480c to your computer and use it in GitHub Desktop.
Code snippets for single node Dask cluster on GCP blog post
# Create a Compute Engine instance named "dask-cluster-instance" in zone
# us-central1-a that runs the domi-dask container image.
#   --machine-type   e2-highcpu-16 machine type
#   --tags           network tags; presumably matched by firewall rules that
#                    allow HTTP/HTTPS traffic -- TODO confirm for your project
#   --container-env  environment variables handed to the container; here
#                    MEMORY_PER_WORKER and THREADS_PER_WORKER are both set to 1
gcloud compute instances create-with-container dask-cluster-instance \
--zone=us-central1-a \
--machine-type=e2-highcpu-16 \
--tags=http-server,https-server \
--container-env=MEMORY_PER_WORKER=1,THREADS_PER_WORKER=1 \
--container-image=registry.hub.docker.com/ianwhitestone/domi-dask:latest
#!/bin/bash
#
# Entrypoint for a single-node Dask cluster: starts one dask-scheduler and a
# set of dask-workers as background jobs, then follows their shared log file.
#
# Optional environment variables:
#   NUM_WORKERS         number of dask workers to start
#                       (default: CPU count reported by Python)
#   SCHEDULER_MEMORY    GB set aside for the scheduler when worker memory is
#                       derived automatically (default: 2)
#   MEMORY_PER_WORKER   memory limit per worker, in GB
#                       (default: (total memory - SCHEDULER_MEMORY) / workers)
#   THREADS_PER_WORKER  threads per worker (default: 1)

# Trace every command so the container logs show the resolved configuration.
set -x

readonly LOG_FILE="log.txt"

# --- Number of workers -------------------------------------------------------
if [[ -n "${NUM_WORKERS:-}" ]]; then
  echo "NUM_WORKERS environment variable found. Setting number of workers to $NUM_WORKERS."
  num_workers=$NUM_WORKERS
else
  num_cores=$(poetry run python -c "import multiprocessing; print(multiprocessing.cpu_count())")
  echo "NUM_WORKERS environment variable NOT found. Defaulting to $num_cores workers"
  num_workers=$num_cores
fi

# --- Memory reserved for the scheduler ---------------------------------------
if [[ -n "${SCHEDULER_MEMORY:-}" ]]; then
  echo "SCHEDULER_MEMORY environment variable found. Setting scheduler memory to $SCHEDULER_MEMORY GB"
  scheduler_memory=$SCHEDULER_MEMORY
else
  echo "SCHEDULER_MEMORY environment variable NOT found. Defaulting to 2GB"
  scheduler_memory=2
fi

# --- Memory per worker -------------------------------------------------------
if [[ -n "${MEMORY_PER_WORKER:-}" ]]; then
  echo "MEMORY_PER_WORKER environment variable found. Setting memory per work to $MEMORY_PER_WORKER."
  memory_per_worker=$MEMORY_PER_WORKER
else
  total_memory=$(poetry run python -c "from distributed.system import memory_limit; print (memory_limit()/1e9)")
  # Divide by the number of workers actually started. The CPU count is only
  # looked up when NUM_WORKERS is unset, so it cannot be relied on here.
  echo "MEMORY_PER_WORKER environment variable NOT found. Defaulting to ($total_memory - $scheduler_memory)/$num_workers GB per worker"
  available_worker_memory=$(echo "$total_memory - $scheduler_memory" | bc -l)
  memory_per_worker=$(echo "$available_worker_memory/$num_workers" | bc -l)
  echo "Memory per worker set to ${memory_per_worker}GB"
fi

# --- Threads per worker ------------------------------------------------------
if [[ -n "${THREADS_PER_WORKER:-}" ]]; then
  echo "THREADS_PER_WORKER environment variable found. Setting threads per work to $THREADS_PER_WORKER."
  threads_per_worker=$THREADS_PER_WORKER
else
  echo "THREADS_PER_WORKER environment variable NOT found. Defaulting to 1 thread per worker"
  threads_per_worker=1
fi

# Start the dask scheduler & workers.
# The log file is truncated once up front; every process then appends (>>),
# so a later launch does not wipe or overwrite output from an earlier one.
: > "$LOG_FILE"

echo "Starting dask-scheduler"
poetry run dask-scheduler >> "$LOG_FILE" 2>&1 &

echo "Creating $num_workers dask workers"
for ((i = 1; i <= num_workers; i++)); do
  poetry run dask-worker \
    --nthreads "$threads_per_worker" \
    --memory-limit "${memory_per_worker}GB" \
    127.0.0.1:8786 >> "$LOG_FILE" 2>&1 &
done

# Follow the combined log in the foreground; this keeps the script (and the
# container using it as entrypoint) running while the background jobs work.
tail -f "$LOG_FILE"
# Image for the single-node Dask cluster: installs the project's poetry
# environment and uses dask-entrypoint.sh as the container entrypoint.
FROM python:3.7.6-buster

# Set the working directory (WORKDIR creates the directory if it is missing)
WORKDIR /opt/app

# Linux updates & dependencies.
# `apt-get update` and `apt-get install` share one RUN so a cached layer with
# a stale package index is never paired with a fresh install step; the index
# is removed afterwards because no later layer needs it.
RUN apt-get update -y \
    && apt-get install -y libpq-dev pandoc \
    && rm -rf /var/lib/apt/lists/*

# Install poetry and configure where it creates virtualenvs
RUN pip install poetry
RUN poetry config virtualenvs.in-project false
RUN poetry config virtualenvs.path ~/.virtualenvs

# Copy poetry files into docker image & build the poetry environment
# (dependencies only; the project itself is not installed)
COPY pyproject.toml .
COPY poetry.lock .
RUN poetry install --no-root

# Copy dask-entrypoint.sh into docker image & make the script executable.
# Done last so editing the script does not invalidate the dependency layers.
COPY docker/dask-entrypoint.sh /usr/local/bin/dask-entrypoint.sh
RUN chmod +x /usr/local/bin/dask-entrypoint.sh

# dask-entrypoint.sh will start up the dask scheduler & workers
ENTRYPOINT ["/usr/local/bin/dask-entrypoint.sh"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment