@tomz
Created October 29, 2016 18:38
rstudio_sparkr_emr4-proc.sh
#!/bin/bash
set -x -e
# AWS EMR bootstrap script
# for installing open-source R (www.r-project.org) with RHadoop packages and RStudio on AWS EMR
#
# tested with AMI 4.0.0 (hadoop 2.6.0)
#
# schmidbe@amazon.de
# 24 September 2014
# 2015-07-14 - Tom Zeng tomzeng@amazon.com, modified on top of Christopher Bozeman's "--sparkr" change to add "--sparkr-pkg"
# 2015-07-29 - Tom Zeng tomzeng@amazon.com, converted to AMI 4.0.0 compatible
# 2016-01-15 - Tom Zeng tomzeng@amazon.com, converted to AMI 4.2.0 compatible and added shiny
# 2016-10-07 - Tom Zeng tomzeng@amazon.com, added Sparklyr and improved install speed by 2-3x
##############################
# Usage:
# --no-rstudio - don't install rstudio-server
# --sparklyr - install RStudio's sparklyr package
# --sparkr - install SparkR package
# --shiny - install Shiny server
#
# --user - set user for rstudio, default "hadoop"
# --user-pw - set user-pw for user USER, default "hadoop"
# --rstudio-port - set rstudio port, default 8787
#
# --rexamples - add R examples to the user home dir, default false
# --rhdfs - install rhdfs package, default false
# --plyrmr - install plyrmr package, default false
# --no-updateR - don't update latest R version
# --latestR - install latest R version, default false (build from source - caution, may cause problems with RStudio)
#
# --sparkr-pkg - install deprecated SparkR-pkg package (obsolete, has RDD API)
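#
# Example invocation (illustrative only: the bucket, key pair, release label,
# and instance settings below are placeholders -- adjust for your environment):
#
#   aws emr create-cluster --name "emr-rstudio" --release-label emr-4.2.0 \
#     --applications Name=Hadoop Name=Spark \
#     --ec2-attributes KeyName=my-key-pair \
#     --instance-type m3.xlarge --instance-count 3 \
#     --bootstrap-actions Path=s3://my-bucket/rstudio_sparkr_emr4-proc.sh,Args=["--sparklyr","--sparkr","--rexamples"]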
# check for master node
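# EMR writes node metadata to /mnt/var/lib/info/instance.json; the isMaster field is true only on the master node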
IS_MASTER=false
if grep isMaster /mnt/var/lib/info/instance.json | grep -q true;
then
IS_MASTER=true
fi
# error message
error_msg ()
{
echo 1>&2 "Error: $1"
}
# get input parameters
RSTUDIO=true
SHINY=false
REXAMPLES=false
USER="hadoop"
USERPW="hadoop"
PLYRMR=false
RHDFS=false
UPDATER=true
LATEST_R=false
RSTUDIOPORT=8787
SPARKR=false
SPARKR_PKG=false
SPARKLYR=false
while [ $# -gt 0 ]; do
case "$1" in
--sparklyr)
SPARKLYR=true
;;
--rstudio)
RSTUDIO=true
;;
--no-rstudio)
RSTUDIO=false
;;
--shiny)
SHINY=true
;;
--rexamples)
REXAMPLES=true
;;
--plyrmr)
PLYRMR=true
;;
--rhdfs)
RHDFS=true
;;
--updateR)
UPDATER=true
;;
--no-updateR)
UPDATER=false
;;
--latestR)
LATEST_R=true
UPDATER=false
;;
--sparkr)
SPARKR=true
;;
--sparkr-pkg)
SPARKR_PKG=true
;;
--rstudio-port)
shift
RSTUDIOPORT=$1
;;
--user)
shift
USER=$1
;;
--user-pw)
shift
USERPW=$1
;;
-*)
# do not exit out, just note failure
error_msg "unrecognized option: $1"
;;
*)
break;
;;
esac
shift
done
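# compile R and R packages with 8 parallel make jobs to speed up source builds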
export MAKE='make -j 8'
sudo yum install -y xorg-x11-xauth.x86_64 xorg-x11-server-utils.x86_64 xterm libXt libX11-devel libXt-devel libcurl-devel git
# install latest R version from AWS Repo
if [ "$UPDATER" = true ]; then
sudo yum update R-core R-base R-core-devel R-devel -y
fi
# create rstudio user on all machines
# we need a unix user with a home directory, a password, and hadoop permissions
if [ "$USER" != "hadoop" ]; then
sudo adduser $USER
fi
sudo sh -c "echo '$USERPW' | passwd $USER --stdin"
mkdir /mnt/r-stuff
cd /mnt/r-stuff
# update to latest R version
if [ "$LATEST_R" = true ]; then
pushd .
mkdir R-latest
cd R-latest
wget http://cran.r-project.org/src/base/R-latest.tar.gz
tar -xzf R-latest.tar.gz
sudo yum install -y gcc gcc-c++ gcc-gfortran
sudo yum install -y readline-devel cairo-devel libpng-devel libjpeg-devel libtiff-devel
cd R-3*
./configure --with-readline=yes --enable-R-profiling=no --enable-memory-profiling=no --enable-R-shlib --with-pic --prefix=/usr --without-x --with-libpng --with-jpeglib --with-cairo --with-recommended-packages=yes
make -j 8
sudo make install
sudo su << EOF1
echo '
export PATH=${PWD}/bin:$PATH
' >> /etc/profile
EOF1
popd
fi
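# make R package installs use parallel make as well (R reads the MAKE variable from Renviron)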
sudo sed -i 's/make/make -j 8/g' /usr/lib64/R/etc/Renviron
# set unix environment variables
sudo su << EOF1
echo '
export HADOOP_HOME=/usr/lib/hadoop
export HADOOP_CMD=/usr/bin/hadoop
export HADOOP_STREAMING=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar
export JAVA_HOME=/etc/alternatives/jre
' >> /etc/profile
EOF1
sudo sh -c "source /etc/profile"
# fix hadoop tmp permission
sudo chmod 777 -R /mnt/var/lib/hadoop/tmp
# RCurl package needs curl-config unix package
sudo yum install -y curl-devel
# fix java binding - R and packages have to be compiled with the same java version as hadoop
sudo R CMD javareconf
# install rstudio
# only run if master node
if [ "$IS_MASTER" = true -a "$RSTUDIO" = true ]; then
# install Rstudio server
# please check and update for latest RStudio version
wget https://download2.rstudio.org/rstudio-server-rhel-0.99.491-x86_64.rpm
sudo yum install --nogpgcheck -y rstudio-server-rhel-0.99.491-x86_64.rpm
#wget https://download3.rstudio.org/centos5.9/x86_64/shiny-server-1.4.1.759-rh5-x86_64.rpm
#sudo yum install --nogpgcheck -y shiny-server-1.4.1.759-rh5-x86_64.rpm
# change port - 8787 will not work for many companies
sudo sh -c "echo 'www-port=$RSTUDIOPORT' >> /etc/rstudio/rserver.conf"
sudo perl -p -i -e "s/= 5../= 100/g" /etc/pam.d/rstudio
#sudo rstudio-server restart
fi
# add examples to user
# only run if master node
if [ "$IS_MASTER" = true -a "$REXAMPLES" = true ]; then
# copy R example scripts to the user's home dir and set permissions
wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/rmr2_example.R
wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/biganalyses_example.R
wget --no-check-certificate https://raw.githubusercontent.com/tomz/emr-bootstrap-actions/master/R/Hadoop/examples/change_pw.R
#sudo cp -p *.R /home/$USER/.
sudo mv *.R /home/$USER/.
sudo chown $USER:$USER -Rf /home/$USER
fi
# install required packages
sudo R --no-save << EOF
install.packages(c('RJSONIO', 'itertools', 'digest', 'Rcpp', 'functional', 'httr', 'plyr', 'stringr', 'reshape2', 'caTools', 'rJava', 'devtools', 'DBI', 'ggplot2', 'dplyr', 'R.methodsS3', 'Hmisc', 'memoise', 'rjson'),
repos="http://cran.rstudio.com")
# here you can add your required packages which should be installed on ALL nodes
# install.packages(c(''), repos="http://cran.rstudio.com", INSTALL_opts=c('--byte-compile') )
EOF
# install rmr2 package
pushd .
rm -rf RHadoop
mkdir RHadoop
cd RHadoop
curl --insecure -L https://github.com/RevolutionAnalytics/rmr2/releases/download/3.3.1/rmr2_3.3.1.tar.gz | tar zx
sudo R CMD INSTALL --byte-compile rmr2
popd
# install rhdfs package
if [ "$RHDFS" = true ]; then
curl --insecure -L https://raw.github.com/RevolutionAnalytics/rhdfs/master/build/rhdfs_1.0.8.tar.gz | tar zx
sudo R CMD INSTALL --byte-compile --no-test-load rhdfs
fi
# install plyrmr package
if [ "$PLYRMR" = true ]; then
curl --insecure -L https://github.com/RevolutionAnalytics/plyrmr/releases/download/0.6.0/plyrmr_0.6.0.tar.gz | tar zx
sudo R CMD INSTALL --byte-compile plyrmr
fi
if [ "$SPARKR" = true ] || [ "$SPARKLYR" = true ] || [ "$SPARKR_PKG" = true ]; then
cat << 'EOF' > /tmp/Renvextra
JAVA_HOME="/etc/alternatives/jre"
HADOOP_HOME_WARN_SUPPRESS="true"
HADOOP_HOME="/usr/lib/hadoop"
HADOOP_PREFIX="/usr/lib/hadoop"
HADOOP_MAPRED_HOME="/usr/lib/hadoop-mapreduce"
HADOOP_YARN_HOME="/usr/lib/hadoop-yarn"
HADOOP_COMMON_HOME="/usr/lib/hadoop"
HADOOP_HDFS_HOME="/usr/lib/hadoop-hdfs"
YARN_HOME="/usr/lib/hadoop-yarn"
HADOOP_CONF_DIR="/usr/lib/hadoop/etc/hadoop/"
YARN_CONF_DIR="/usr/lib/hadoop/etc/hadoop/"
MAHOUT_HOME="/usr/lib/mahout"
MAHOUT_CONF_DIR="/usr/lib/mahout/conf"
MAHOUT_LOG_DIR="/mnt/var/log/mahout"
HIVE_HOME="/usr/lib/hive"
HIVE_CONF_DIR="/usr/lib/hive/conf"
HBASE_HOME="/usr/lib/hbase"
HBASE_CONF_DIR="/usr/lib/hbase/conf"
IMPALA_HOME="/usr/lib/impala"
IMPALA_CONF_DIR="/usr/lib/impala/conf"
SPARK_HOME="/usr/lib/spark"
SPARK_CONF_DIR="/usr/lib/spark/conf"
PATH=${PWD}:${PATH}
EOF
cat /tmp/Renvextra | sudo tee -a /usr/lib64/R/etc/Renviron
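# appending to the site-wide Renviron makes the Hadoop/Spark environment visible to every R session, including RStudio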
# wait for EMR to finish installing Spark (the SparkR package shows up under /usr/lib/spark/R/lib)
while [ ! -d /usr/lib/spark/R/lib/SparkR ]
do
sleep 5
done
sleep 5
fi
# install SparkR or the outdated SparkR-pkg
if [ "$SPARKR" = true ] || [ "$SPARKR_PKG" = true ]; then
# the following are needed only if not logging in as hadoop
sudo mkdir /mnt/spark
sudo chmod a+rwx /mnt/spark
if [ -d /mnt1 ]; then
sudo mkdir /mnt1/spark
sudo chmod a+rwx /mnt1/spark
fi
if [ "$SPARKR" = true ]; then
sudo R --no-save << EOF
library(devtools)
install('/usr/lib/spark/R/lib/SparkR')
# here you can add your required packages which should be installed on ALL nodes
# install.packages(c(''), repos="http://cran.rstudio.com", INSTALL_opts=c('--byte-compile') )
EOF
else
pushd .
git clone https://github.com/amplab-extras/SparkR-pkg.git
cd SparkR-pkg
git checkout sparkr-sql # Spark 1.4 support is in this branch
sudo su << EOF
echo '
export PATH=${PWD}:$PATH
' >> /etc/profile
EOF
# wait for the hadoop-lzo jar and the hadoop client dir to show up
while [ ! -f /usr/lib/hadoop-lzo/lib/hadoop-lzo.jar -o ! -d /usr/lib/hadoop/client ]
do
sleep 5
done
sleep 5
# copy the emr dependencies to the SBT unmanaged jars directory
mkdir pkg/src/lib
cp /usr/lib/hadoop-lzo/lib/hadoop-lzo.jar pkg/src/lib
cp /usr/lib/hadoop/client/hadoop-mapreduce-client-core-2.6.0-amzn-*.jar pkg/src/lib
wget http://central.maven.org/maven2/com/typesafe/sbt/sbt-launcher/0.13.6/sbt-launcher-0.13.6.jar
# fix the corrupted sbt-launch-0.13.6.jar in the github repo
cp sbt-launcher-0.13.6.jar pkg/src/sbt/sbt-launch-0.13.6.jar
# build against Spark 1.4 and YARN/Hadoop 2.6
USE_YARN=1 SPARK_VERSION=1.4.0 SPARK_YARN_VERSION=2.6.0 SPARK_HADOOP_VERSION=2.6.0 ./install-dev.sh
sudo R --no-save << EOF
install.packages('testthat',repos="http://cran.rstudio.com")
library(devtools)
install('${PWD}/pkg/R/SparkR')
# here you can add your required packages which should be installed on ALL nodes
# install.packages(c(''), repos="http://cran.rstudio.com", INSTALL_opts=c('--byte-compile') )
install.packages('randomForest',repos="http://cran.rstudio.com")
install.packages('caret',repos="http://cran.rstudio.com")
install.packages('pROC',repos="http://cran.rstudio.com")
EOF
popd
fi
fi
if [ "$SPARKLYR" = true ]; then
sudo R --no-save << EOF
install.packages(c('sparklyr', 'nycflights13', 'Lahman', 'data.table'),
repos="http://cran.rstudio.com" )
EOF
fi
if [ "$IS_MASTER" = true -a "$SHINY" = true ]; then
# install Shiny server
wget https://download3.rstudio.org/centos5.9/x86_64/shiny-server-1.4.1.759-rh5-x86_64.rpm
sudo yum install --nogpgcheck -y shiny-server-1.4.1.759-rh5-x86_64.rpm
sudo R --no-save << EOF
install.packages(c('shiny'),
repos="http://cran.rstudio.com")
EOF
fi
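# for a non-default user, create an HDFS home directory once the HDFS CLI is available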
if [ "$USER" != "hadoop" ]; then
while [ ! -f /usr/bin/hdfs ]
do
sleep 5
done
sudo -u hdfs hadoop fs -mkdir /user/$USER
sudo -u hdfs hadoop fs -chown root /user/$USER
sudo -u hdfs hdfs dfs -chmod -R 777 /user/$USER
fi
sudo rstudio-server restart
echo "rstudio server and packages installation completed"

StevenMMortimer commented Dec 16, 2016

@tomz I'm having an issue with the script at line 287. It just hangs waiting for the SparkR file to show up. I assume it's something with the way EMR deploys Spark on the cluster. Any hints on how to resolve?

+ sudo tee -a /usr/lib64/R/etc/Renviron
+ '[' '!' -d /usr/lib/spark/R/lib/SparkR ']'
+ sleep 5
+ '[' '!' -d /usr/lib/spark/R/lib/SparkR ']'
+ sleep 5
+ '[' '!' -d /usr/lib/spark/R/lib/SparkR ']'
+ sleep 5
+ '[' '!' -d /usr/lib/spark/R/lib/SparkR ']'
+ sleep 5

This continues until the cluster rolls back.

After some research, it appears that /usr/lib/spark/R is not created on the slave nodes, although R is installed and can be accessed via the R command.

@conangal

Thanks for posting this, Tom. Is there a more up-to-date version available? I read something in your AWS blog post about a --cloudyr option?
